MAE Augmentation and assay transformation

Author

Laura Symul

Published

August 12, 2025

Code
# Load analysis packages (tidyverse for data wrangling, patchwork for plot
# composition, viridis for color scales, gt for tables).
library(tidyverse)
library(magrittr)
library(patchwork)
library(viridis)
library(gt)

# Source all project helper functions. Both "R" and "../R" are tried so the
# document can be rendered from the project root or a subdirectory.
# NOTE(review): fs::dir_map() errors on a missing directory — confirm both
# paths exist in the rendering environment.
tmp <- fs::dir_map("R", source)
tmp <- fs::dir_map("../R", source)

# Project-wide ggplot2 theme (defined in the sourced helpers).
theme_set(theme_publication())
Code
# Mount the LACTIN-V Dropbox remote locally; requires rclone configured with
# the `lactinv-dropbox` remote.
mkdir -p lactinv_dropbox
rclone mount lactinv-dropbox:"/Gates LACTIN-V/" "lactinv_dropbox/"
Code
# Load the most recent saved MultiAssayExperiment from the data directory.
mae <- load_latest_mae(dir = str_c(data_dir(), "02_mae/"))
Loading mae_20250704.RDS
Code
# Keep an untouched copy of the MAE before augmentation.
raw_mae <- mae

In this document, we augment the MAE (MultiAssay Experiment) object with additional assays derived from the raw data.

Augmenting colData(mae)

For downstream analyses, it is convenient to add variables such as has_V0, has_V1, or has_V0_and_V1 to flag participants for which samples at Visit 0 (pre-MTZ) and/or Visit 1 (post-MTZ) are available.

Code
# Flag, per participant (USUBJID), whether samples exist at Visit 0 (pre-MTZ)
# and/or Visit 1 (post-MTZ), then copy those flags back into colData(mae).
clin <- as.data.frame(MultiAssayExperiment::colData(mae))
clin <- clin |>
  group_by(USUBJID) |>
  mutate(
    has_V0 = any(AVISITN == 0),
    has_V1 = any(AVISITN == 1),
    has_V0_and_V1 = has_V0 & has_V1
  ) |>
  ungroup()

# group_by()/mutate() preserve row order, so `clin` rows stay aligned with
# the rows of colData(mae) and a plain column copy is safe.
for (flag in c("has_V0", "has_V1", "has_V0_and_V1")) {
  MultiAssayExperiment::colData(mae)[[flag]] <- clin[[flag]]
}

16S-derived assays

Filtering out low count ASVs

Code
# Thresholds used to drop low-count ASVs (see explanation below).
filter_criteria <- list(
  min_tot_count = 50,
  n_samples = 2,
  min_counts_per_sample = 5
)

Sometimes, it is convenient to work with a “lighter” ASV table. To create that lighter table, we keep ASVs that have at least 5 counts in at least 2 samples and at least 50 counts across all samples.

Code
# Identify ASVs passing the low-count filter (>= 5 counts in >= 2 samples and
# >= 50 counts in total; see `filter_criteria`). Returns a diagnostic plot and
# a filtered SummarizedExperiment ("ASV_16S_filtered").
non_zero_ASVs <- 
  filter_out_low_count_ASVs(
    mae, assay_name = 'ASV_16S', 
    min_tot_count = filter_criteria$min_tot_count,
    min_n_samples = filter_criteria$n_samples, 
    min_counts_per_sample = filter_criteria$min_counts_per_sample
    )

# Diagnostic plot; the log-axis warning about infinite values is expected.
non_zero_ASVs$plot
Warning in scale_y_log10(str_c("number of samples with at least ",
min_counts_per_sample, : log-10 transformation introduced infinite values.

Code
# Append the filtered ASV table as a new experiment in the MAE.
mae <- c(mae, non_zero_ASVs$SE)
Code
# Quantify the effect of the ASV filter: how many ASVs, and what share of
# total counts, were removed. These values are reported in the text below.
n_all_ASV <- nrow(MultiAssayExperiment::assay(mae, "ASV_16S"))
n_non_zero_ASVs <- nrow(
  SummarizedExperiment::assay(non_zero_ASVs$SE$ASV_16S_filtered)
)

perc_removed <- round(100 * (1 - n_non_zero_ASVs / n_all_ASV))

counts_all_ASVs <- sum(MultiAssayExperiment::assay(mae, "ASV_16S"))
counts_non_zero_ASVs <- sum(
  SummarizedExperiment::assay(non_zero_ASVs$SE$ASV_16S_filtered)
)

perc_counts_removed <- round(
  100 * (1 - counts_non_zero_ASVs / counts_all_ASVs), 2
)

That removes 92% of ASVs, but only 0.33% of total counts.

Relative abundance assays

We add three assays with relative abundances:

  • one with the relative abundance of ASVs

  • one with the relative abundances of taxa (ASV counts aggregated by taxonomic level)

  • one with the relative abundances of taxa and where we distinguish between CTV-05 and non-CTV-05 Lactobacillus crispatus

Code
# Relative abundances of the filtered ASVs.
rel_abundances_ASV <- 
  get_relative_abundance_assay(mae, assay_name = "ASV_16S_filtered") 
mae <- c(mae, rel_abundances_ASV)

# Relative abundances at the taxon level.
rel_abundances_tax <- get_relative_abundance_assay(mae, assay_name = "tax_16S") 
mae <- c(mae, rel_abundances_tax)
Code
# Build a sample x taxa proportion matrix in which Lactobacillus crispatus is
# split into CTV-05 / non-CTV-05 / undetermined (from the MG_CTV05 assay),
# while all other taxa proportions come from the tax_16S_p assay.
ctv05_wide <- get_assay_wide_format(mae, "MG_CTV05")

props_with_ctv05 <- 
  ctv05_wide |> 
  select(Barcode, assay) |> 
  unnest(assay) |> 
  # Drop the non-crispatus column: those taxa are covered by tax_16S_p below.
  select(-`Non L. crispatus`) |> 
  dplyr::rename(
    "CTV05" = `CTV-05`, 
    "non-CTV05 Lactobacillus crispatus" = `non-CTV-05 L. crispatus`,
    "undetermined Lactobacillus crispatus" = `Undetermined L. crispatus`
    ) |> 
  # Join the remaining (non-L. crispatus) taxa proportions by sample barcode.
  full_join(
    get_assay_wide_format(mae, "tax_16S_p", add_colData = FALSE) |>
      unnest(assay) |> 
      select(-`Lactobacillus crispatus`),
    by = join_by(Barcode)
    ) |> 
  as.data.frame() |> 
  column_to_rownames("Barcode") |> 
  as.matrix()

# Renormalize so that each sample's (row's) proportions sum to 1.
props_with_ctv05 <- 
  props_with_ctv05 / rowSums(props_with_ctv05)
Code
# Build rowData for the CTV-05-resolved assay: duplicate the L. crispatus row
# three times (one per strain category) and keep all other taxa rows as-is.
rowdata <- 
  SummarizedExperiment::rowData(mae[["tax_16S"]]) |> 
  as.data.frame() 

j <- which(rownames(rowdata) == "Lactobacillus crispatus")
new_rowdata <- 
  bind_rows(
    rowdata[rep(j, 3), ] |> 
      mutate(
        strain = c("CTV-05", "non-CTV-05", "undetermined"),
        tax_label = str_c("Lactobacillus crispatus (", strain, ")"),
        key = tax_label
        ) |> 
      # Row names must match the column names of `props_with_ctv05`.
      set_rownames(
        c(
          "CTV05", 
          "non-CTV05 Lactobacillus crispatus", 
          "undetermined Lactobacillus crispatus")
        ),
    rowdata[-j,]
  )
Code
# Wrap the CTV-05-resolved proportions into a SummarizedExperiment
# (taxa in rows, samples in columns), named "tax_CTV05_p".
new_assay <-  list()
new_assay[["tax_CTV05_p"]] <-
  SummarizedExperiment::SummarizedExperiment(
    assay = props_with_ctv05 |> t(),
    rowData = new_rowdata
  )
# Print a summary of the new assay.
new_assay
$tax_CTV05_p
class: SummarizedExperiment 
dim: 222 1176 
metadata(0):
assays(1): ''
rownames(222): CTV05 non-CTV05 Lactobacillus crispatus ... Leptotrichia
  shahii Lautropia mirabilis
rowData names(13): Domain Phylum ... key strain
colnames(1176): 202302505 202302518 ... 202306743 203323517
colData names(0):
Code
# Append the CTV-05-resolved taxa proportions to the MAE.
mae <- c(mae, new_assay)

Prop Lacto and Categories

We create and add an assay that has the proportion of Lactobacillus in each sample

Code
# Compute, per sample, the proportion of L. crispatus, L. iners, other
# Lactobacillus, all Lactobacillus, non-iners Lactobacillus, and
# non-Lactobacillus taxa, and store the result as a new assay "prop_Lacto".
props <- 
  get_assay_long_format(mae, "tax_16S_p", add_colData = FALSE)
prop_Lacto <- 
  props |> 
  mutate(
    # Assign every taxon to exactly one of four categories.
    category  = 
      case_when(
        str_detect(feature, "crispatus") ~ "L_crisp",
        str_detect(feature, "iners") ~ "L_iners",
        str_detect(feature, "Lactobacillus") ~ "other_Lacto",
        TRUE ~ "non_Lacto"
      )
  ) |> 
  group_by(Barcode, category) |> 
  summarize(prop = sum(value), .groups = "drop") |> 
  pivot_wider(id_cols = Barcode, names_from = category, values_from = prop,
              names_prefix = "prop_") |> 
  mutate(
    prop_Lacto = prop_other_Lacto + prop_L_crisp + prop_L_iners,
    prop_non_iners_Lacto = prop_Lacto - prop_L_iners
    ) |> 
  select(Barcode, prop_L_crisp, prop_L_iners, prop_other_Lacto, prop_Lacto, prop_non_iners_Lacto, prop_non_Lacto) |> 
  as.data.frame() 
# Move the barcode into row names so the matrix transposes cleanly below.
prop_Lacto <- prop_Lacto |> set_rownames(prop_Lacto$Barcode) |> select(-Barcode)

prop_Lacto_assay <- list()
prop_Lacto_assay[["prop_Lacto"]] <- 
  SummarizedExperiment::SummarizedExperiment(
    assay = prop_Lacto |> t()
  )

mae <- c(mae, prop_Lacto_assay)

The distribution of Lactobacillus is as follows:

Code
# Histogram of the total Lactobacillus proportion across samples.
ggplot(prop_Lacto, aes(x = prop_Lacto)) +
  geom_histogram(binwidth = 0.02) +
  scale_x_continuous(
    "Proportion of Lactobacillus",
    breaks = seq(0, 1, by = 0.1)
  )

The distribution of Lactobacillus crispatus is:

Code
# Histogram of the L. crispatus proportion across samples.
ggplot(prop_Lacto, aes(x = prop_L_crisp)) +
  geom_histogram(binwidth = 0.02) +
  scale_x_continuous(
    "Proportion of L. crispatus",
    breaks = seq(0, 1, by = 0.1)
  )

And we also add an assay that categorizes each target visit sample according to the endpoint definition.

Code
# Categorize each sample according to the endpoint definition:
# L. crispatus-dominated, Lactobacillus- (but not crispatus-) dominated, or
# not Lactobacillus-dominated. Stored as assay "mb_categories".
categories_levels <- 
  c(
    "≥ 50% L. crispatus", 
    "≥ 50% Lactobacillus, < 50% L. crispatus", 
    "< 50% Lactobacillus" 
  )

categories <- 
  get_assay_wide_format(mae, "prop_Lacto") |> 
  mutate(
    category = 
      # The (AVISITN >= 0) guard leaves category NA when AVISITN is NA or
      # negative (non-target visits).
      case_when(
        (AVISITN >= 0) & (assay$prop_L_crisp >= 0.5) ~ categories_levels[1],
        (AVISITN >= 0) & (assay$prop_Lacto >= 0.5) ~ categories_levels[2],
        (AVISITN >= 0) ~ categories_levels[3],
        TRUE ~ NA_character_
      ) |> factor(levels = categories_levels)
  ) |> 
  select(Barcode, category) |> 
  as.data.frame() |> 
  column_to_rownames("Barcode")

categories_assay <- list()
categories_assay[["mb_categories"]] <- 
  SummarizedExperiment::SummarizedExperiment(
    assay = categories |> t()
  )

mae <- c(mae, categories_assay)

The distribution of the microbiota categories is as follows:

Code
# Bar chart of the number of samples in each microbiota category.
categories |>
  filter(!is.na(category)) |>
  ggplot(aes(y = category)) +
  geom_bar() +
  labs(x = "Number of samples", y = "")

Code
# One-hot encode the microbiota categories (one 0/1 column per category,
# NA for samples without an assignment) and store as "mb_categories_wide".
categories_wide <-
  categories |>
  rownames_to_column() |>
  mutate(tmp = (!is.na(category)) * 1) |>
  pivot_wider(names_from = category, values_from = tmp, values_fill = 0) |> 
  select(-any_of("NA")) |> 
  # Restore NA (instead of 0) in every category column for samples that have
  # no category assignment. (The original anonymous function contained a dead
  # leading `x;` statement; `replace()` expresses the same operation.)
  # NOTE(review): this relies on pivot_wider() preserving the row order of
  # `categories` — true here because each rowname is unique.
  mutate(across(
    all_of(categories_levels),
    \(x) replace(x, is.na(categories$category), NA)
  )) |> 
  as.data.frame() |> 
  column_to_rownames() |> 
  select(all_of(categories_levels))


categories_wide_assay <- list()
categories_wide_assay[["mb_categories_wide"]] <- 
  SummarizedExperiment::SummarizedExperiment(
    assay = categories_wide |> t()
  )

mae <- c(mae, categories_wide_assay)

CST and sub-CST assignments

We add an assay with the CST and sub-CST assignments (VALENCIA) for each sample.

Code
# Assign each sample to a VALENCIA CST/sub-CST using Yue-Clayton similarity.
# Samples in rows, taxa in columns.
counts <- MultiAssayExperiment::assay(mae, "tax_16S") |> t()

library(ValenciaR) # devtools::install_github("lasy/ValenciaR")

# Map this study's taxonomy onto the taxa used by VALENCIA.
counts_Valencia <- 
  ValenciaR::convert_to_Valencia_taxonomy(
    input = counts, 
    tax_table = SummarizedExperiment::rowData(mae[["tax_16S"]]) |> as.data.frame()
  )

# The warnings about assumed counts and missing taxa are expected (not all
# VALENCIA taxa are represented in this dataset).
assignments <- 
  ValenciaR::assign_to_Valencia_clusters(
    input = counts_Valencia$converted_input,
    distance = "YC"
  )
Warning in ValenciaR::assign_to_Valencia_clusters(input = counts_Valencia$converted_input, : Assuming count data (as opposed to proportions) were provided.
Warning in ValenciaR::assign_to_Valencia_clusters(input = counts_Valencia$converted_input, : `input` does not have counts/proportions for all taxa represented in Valencia.
Data for 102/199 taxa are missing.
These missing taxa represent around 17% (1-83%) of Valencia cluster composition.
Their list is in the `missing_taxa` component of the returned value.
Code
# Store the CST/sub-CST assignments as a new assay "CST" in the MAE.
CST_assay <- list(
  CST = SummarizedExperiment::SummarizedExperiment(
    assay = t(assignments$assignment)
  )
)

mae <- c(mae, CST_assay)

The distribution of CST/sub-CSTs is as follows:

Code
# Bar chart of sample counts per sub-CST, grouped by parent CST.
ggplot(assignments$assignment, aes(y = fct_rev(subCST))) +
  geom_bar() +
  facet_grid(CST ~ ., scales = "free", space = "free") +
  ylab("CST/subCST") +
  theme(strip.text.y = element_text(angle = 0))

And the distribution of dissimilarities between samples and assigned subCST centroids is as follows:

Code
# Histogram of the dissimilarity between each sample and its assigned
# sub-CST centroid, faceted by sub-CST, with the per-facet median in red.
# FIX: `fun.y` (deprecated since ggplot2 3.3.0) -> `fun`, and `stat(y)`
# (deprecated since 3.4.0) -> `after_stat(y)`, silencing the warnings.
assignments$assignment |> 
  ggplot(aes(x = distance_to_subCST)) +
  geom_histogram(binwidth = 0.02) +
  facet_grid(subCST ~ ., scales = "free") +
  stat_summary(
    aes(x = 0.1, y = distance_to_subCST, xintercept = after_stat(y)), 
    fun = median, geom = "vline", col = "red"
    ) +
  theme(strip.text.y = element_text(angle = 0)) +
  xlab("Dissimilarity (Yue-Clayton) to subCST centroid")
Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
ℹ Please use the `fun` argument instead.
Warning: `stat(y)` was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(y)` instead.

Code
# For each sample, find the genus with the highest summed relative abundance.
most_prevalent_genus <- 
  get_assay_long_format(mae, "tax_16S_p", add_colData = FALSE) |> 
  # Attach the taxonomy (incl. Genus) to each feature.
  left_join(
    SummarizedExperiment::rowData(mae[["tax_16S_p"]]) |> 
      as.data.frame() |>
      rownames_to_column("feature") |> 
      as_tibble(),
    by = join_by(feature)
  ) |> 
  group_by(Barcode, Genus) |> 
  summarize(rel_ab = sum(value), .groups = "drop") |> 
  # Keep, per sample, the genus with the largest aggregated abundance.
  arrange(Barcode, -rel_ab) |> 
  group_by(Barcode) |>
  slice_head(n = 1) |> 
  ungroup() |> 
  dplyr::rename(most_prev_genus = Genus) |> 
  # Order factor levels by decreasing abundance and shorten the BVAB1 label.
  arrange(-rel_ab) |>
  mutate(most_prev_genus = most_prev_genus |> str_replace("Candidatus Lachnocurva", "Ca. Lachn. (BVAB1)") |> fct_inorder()) |> 
  arrange(Barcode) |> 
  select(-rel_ab)
Code
# Same dissimilarity histograms, faceted by most prevalent genus instead of
# sub-CST; median shown in red.
# FIX: `fun.y` -> `fun` and `stat(y)` -> `after_stat(y)` (deprecated ggplot2
# APIs; see warnings in the rendered output above).
assignments$assignment |> 
  rownames_to_column("Barcode") |> 
  left_join(
    most_prevalent_genus, 
    by = join_by(Barcode)
  ) |>
  ggplot(aes(x = distance_to_subCST)) +
  geom_histogram(binwidth = 0.02) +
  stat_summary(
    aes(x = 0.1, y = distance_to_subCST, xintercept = after_stat(y)), 
    fun = median, geom = "vline", col = "red"
    ) +
  facet_grid(most_prev_genus ~ ., scales = "free") +
  theme(strip.text.y = element_text(angle = 0)) +
  xlab("Dissimilarity (Yue-Clayton) to subCST centroid")

Code
# Table: number and share of samples per most prevalent genus.
most_prevalent_genus |>
  dplyr::count(most_prev_genus) |>
  mutate(`%` = round(100 * n / sum(n))) |>
  gt(caption = "Number and % of sample per most prevalent genus")
Number and % of sample per most prevalent genus
most_prev_genus n %
Lactobacillus 644 55
Gardnerella 245 21
Prevotella 183 16
Streptococcus 2 0
Ca. Lachn. (BVAB1) 75 6
Sneathia 10 1
Mobiluncus 2 0
Fannyhessea 10 1
Haemophilus 1 0
Actinomyces 1 0
Gemella 1 0
Code
# Repeat the VALENCIA assignment using Bray-Curtis instead of Yue-Clayton.
# The warnings about counts and missing taxa are expected (see above).
assignments_BC <- 
  ValenciaR::assign_to_Valencia_clusters(
    input = counts_Valencia$converted_input,
    distance = "BC"
  )
Warning in ValenciaR::assign_to_Valencia_clusters(input = counts_Valencia$converted_input, : Assuming count data (as opposed to proportions) were provided.
Warning in ValenciaR::assign_to_Valencia_clusters(input = counts_Valencia$converted_input, : `input` does not have counts/proportions for all taxa represented in Valencia.
Data for 102/199 taxa are missing.
These missing taxa represent around 17% (1-83%) of Valencia cluster composition.
Their list is in the `missing_taxa` component of the returned value.
Code
# Bray-Curtis counterpart of the dissimilarity histograms by sub-CST,
# with per-facet median in red.
# FIX: `fun.y` -> `fun` and `stat(y)` -> `after_stat(y)` (deprecated).
assignments_BC$assignment |> 
  ggplot(aes(x = distance_to_subCST)) +
  geom_histogram(binwidth = 0.02) +
  facet_grid(subCST ~ ., scales = "free") +
  stat_summary(
    aes(x = 0.1, y = distance_to_subCST, xintercept = after_stat(y)), 
    fun = median, geom = "vline", col = "red"
    ) +
  theme(strip.text.y = element_text(angle = 0)) +
  xlab("Dissimilarity (Bray-Curtis) to subCST centroid")

Code
# Bray-Curtis dissimilarity histograms, faceted by most prevalent genus,
# shown as densities with the per-facet median in red.
# FIX: deprecated ggplot2 APIs replaced — `..density..` -> 
# `after_stat(density)`, `stat(y)` -> `after_stat(y)`, `fun.y` -> `fun`
# (see warnings in the rendered output).
assignments_BC$assignment  |> 
  rownames_to_column("Barcode") |> 
  left_join(
    most_prevalent_genus, 
    by = join_by(Barcode)
  ) |>
  ggplot() +
  aes(x = distance_to_subCST) +
  facet_grid(most_prev_genus ~ ., scales = "free") +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.02) +
  stat_summary(
    aes(x = 0.1, y = distance_to_subCST, xintercept = after_stat(y)), 
    fun = median, geom = "vline", col = "red"
    ) +
  theme(strip.text.y = element_text(angle = 0)) +
  xlab("Dissimilarity (Bray-Curtis) to subCST centroid")
Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.

We note that for many non-Lactobacillus-dominated subCSTs, the dissimilarity to the centroid is quite high, which suggests that these subCSTs are not well defined in this cohort.

Code
# Share of samples whose BC dissimilarity to the assigned centroid is > 0.5.
assignments_BC$assignment |>
  mutate(
    `BC dissimilarity to centroid` =
      ifelse(distance_to_subCST > 0.5, "> 0.5", "≤ 0.5")
  ) |>
  dplyr::count(`BC dissimilarity to centroid`) |>
  mutate(`%` = round(100 * n / sum(n))) |>
  gt()
BC dissimilarity to centroid n %
> 0.5 188 16
≤ 0.5 986 84

The median BC dissimilarity to centroids is 0.34.

Code
# For each sample, compare the distance to the assigned (closest) sub-CST
# centroid with the distance to the next closest centroid, and derive a
# "goodness of assignment" score per sub-CST (YC distances).
tmp <- 
assignments$distances |> 
  as.data.frame() |> 
  rownames_to_column("Barcode") |>
  pivot_longer(-Barcode, names_to = "subCST", values_to = "distance") |>
  group_by(Barcode) |> 
  arrange(Barcode, distance) |> 
  # Keep the two smallest distances per sample.
  slice_head(n = 2) |> 
  summarize(
    diff_dist = distance[2] - distance[1], 
    distance_with_next_closest_subCST = distance[2],
    .groups = "drop"
    ) |> 
  left_join(
    assignments$assignment |> rownames_to_column("Barcode"), by = join_by(Barcode)
  ) |> 
  group_by(subCST) |>
  # Higher = distances are well separated relative to the sub-CST's spread.
  mutate(goodness_of_assignment = (diff_dist - 2 * distance_to_subCST)/max(distance_to_subCST)) |> 
  ungroup() 

# Scatter of closest vs next-closest distance, per sub-CST; the dashed red
# line marks the "almost equidistant" threshold used later.
tmp |>
  ggplot() +
  aes(x = distance_to_subCST, y = distance_with_next_closest_subCST, col = goodness_of_assignment) +
  facet_wrap(subCST ~ .) +
  geom_abline(intercept = 0, slope = 1) +
  geom_abline(intercept = 0.05, slope = 1.2, linetype = 3, col = "red", alpha = 0.5) +
  geom_point(alpha = 0.5) +
  scale_color_gradient("Goodness of\nsubCST assignment", low = "red", high = "steelblue1") +
  xlab("YC dissimilarity to closest subCST") +
  ylab("YC dissimilarity to next closest subCST") 

Code
# Same diagnostic as above, but with the Bray-Curtis distances.
# NOTE: this overwrites `tmp`; subsequent chunks use the BC version.
tmp <- 
assignments_BC$distances |> 
  as.data.frame() |> 
  rownames_to_column("Barcode") |>
  pivot_longer(-Barcode, names_to = "subCST", values_to = "distance") |>
  group_by(Barcode) |> 
  arrange(Barcode, distance) |> 
  slice_head(n = 2) |> 
  summarize(
    diff_dist = distance[2] - distance[1], 
    distance_with_next_closest_subCST = distance[2],
    .groups = "drop"
    ) |> 
  left_join(
    assignments_BC$assignment |> rownames_to_column("Barcode"), by = join_by(Barcode)
  ) |> 
  group_by(subCST) |>
  mutate(goodness_of_assignment = (diff_dist - 2 * distance_to_subCST)/max(distance_to_subCST)) |> 
  ungroup() 

tmp |>
  ggplot() +
  aes(x = distance_to_subCST, y = distance_with_next_closest_subCST, col = goodness_of_assignment) +
  facet_wrap(subCST ~ .) +
  geom_abline(intercept = 0, slope = 1) +
  geom_abline(intercept = 0.05, slope = 1.2, linetype = 3, col = "red", alpha = 0.5) +
  geom_point(alpha = 0.5) +
  scale_color_gradient("Goodness of\nsubCST assignment", low = "red", high = "steelblue1") +
  xlab("BC dissimilarity to closest subCST") +
  ylab("BC dissimilarity to next closest subCST") 

Code
# Flag samples that are almost as close to the next sub-CST centroid as to
# the assigned one (below the dashed red line in the plots above).
# NOTE(review): "CTS/subCSTs" in the label looks like a typo for
# "CST/subCSTs"; kept as-is to match the rendered tables below.
tmp <- 
  tmp |> 
  mutate(
    `Distance to centroids` = 
      ifelse(
        distance_with_next_closest_subCST < (0.05 + 1.2 * distance_to_subCST), 
        "Almost equidistant to two CTS/subCSTs",
        "Closer to assigned CST/subCST"
      )
    
  )

tmp |> 
  dplyr::count(`Distance to centroids`) |> 
  mutate(`%` = (100 * n/sum(n)) |> round()) |> 
  gt::gt()
Distance to centroids n %
Almost equidistant to two CTS/subCSTs 464 40
Closer to assigned CST/subCST 710 60

We also note that, in addition to being very far from their respective subCST centroids, about 40% of samples are almost as close to their assigned subCST as to the next closest subCST (below the dashed line). This proportion varies widely by CST/subCST:

Code
# Same breakdown, stratified by assigned sub-CST.
tmp |>
  group_by(subCST) |>
  dplyr::count(`Distance to centroids`) |>
  mutate(`%` = round(100 * n / sum(n))) |>
  gt::gt(row_group_as_column = TRUE)
Distance to centroids n %
I-A Almost equidistant to two CTS/subCSTs 8 5
Closer to assigned CST/subCST 168 95
I-B Almost equidistant to two CTS/subCSTs 45 90
Closer to assigned CST/subCST 5 10
II Almost equidistant to two CTS/subCSTs 1 100
III-A Almost equidistant to two CTS/subCSTs 35 19
Closer to assigned CST/subCST 154 81
III-B Almost equidistant to two CTS/subCSTs 107 69
Closer to assigned CST/subCST 48 31
IV-A Almost equidistant to two CTS/subCSTs 66 37
Closer to assigned CST/subCST 114 63
IV-B Almost equidistant to two CTS/subCSTs 190 50
Closer to assigned CST/subCST 189 50
IV-C0 Almost equidistant to two CTS/subCSTs 4 100
IV-C1 Almost equidistant to two CTS/subCSTs 1 33
Closer to assigned CST/subCST 2 67
V Almost equidistant to two CTS/subCSTs 7 19
Closer to assigned CST/subCST 30 81

For the non-Lactobacillus dominated samples (i.e., samples in which the proportion of Lactobacillus is < 50%), that proportion is even higher.

Code
# Same breakdown, stratified by whether the sample is Lactobacillus-dominated
# (>= 50% Lactobacillus, from the "prop_Lacto" assay).
tmp |> 
  left_join(
    get_assay_long_format(mae, "prop_Lacto", add_colData = FALSE) |> 
      filter(feature == "prop_Lacto") |> 
      dplyr::rename(prop_Lacto = value) |> 
      select(Barcode, prop_Lacto), 
    by = join_by(Barcode)
  ) |> 
  mutate(
    Lacto_group = ifelse(prop_Lacto >= 0.5, "≥ 50% Lactobacillus", "< 50% Lactobacillus")
    ) |>
  group_by(Lacto_group) |>
  dplyr::count(`Distance to centroids`) |> 
  mutate(`%` = (100 * n/sum(n)) |> round()) |> 
  gt::gt(row_group_as_column = TRUE)
Distance to centroids n %
< 50% Lactobacillus Almost equidistant to two CTS/subCSTs 281 48
Closer to assigned CST/subCST 307 52
≥ 50% Lactobacillus Almost equidistant to two CTS/subCSTs 183 31
Closer to assigned CST/subCST 403 69

Topic models of the taxa counts

Analyses above show that many samples do not fit well into the CST/subCST framework.

An alternative to classifying samples into a single CST/subCST is to use topic models, which are mixed membership models that allow each sample to be a mixture of subcommunities (a mixture of several topics) (see Sankaran and Holmes, 2019 and Symul et al., 2023).

We fit the topics de novo, that is we do not use “reference” subcommunities, but rather identify these subcommunities from the data.

We do this using the alto package, which is a wrapper around the lda package, and which allows to examine how subcommunities relate to each other across models with different number of subcommunities.

So, we first fit the topic models for different number of subcommunities (K), and then we label the sub-communities according to their taxonomic composition and such that their label match their most similar subCST.

Code
# Taxa count matrix (samples in rows) used to fit the topic models.
counts <- t(MultiAssayExperiment::assay(mae, "tax_16S"))
# Largest number of topics (K) to fit.
max_K <- 15

We fit the topic models for K in 1 to 15.

Code
library(alto) # devtools::install_github("lasy/alto")

# Fit LDA topic models for K = 1..max_K; results are cached in `dir` and
# reused unless reset = TRUE.
lda_models <- 
  run_lda_models(
    data = counts, 
    lda_varying_params_lists = 
      setNames(purrr::map(1:max_K, ~ list(k = .)), 1:max_K),
    dir = str_c(data_dir(),"/03_augmented_mae/lda_models/"),
    reset = FALSE, 
    verbose = TRUE
  )


# VALENCIA centroid compositions, used to label topics after their most
# similar sub-CST.
valencia_centroids_mat <- 
  ValenciaR::get_Valencia_clusters()

labelled_lda_models <- 
  label_models_topics(
    lda_models = lda_models,
    tax_table = 
      SummarizedExperiment::rowData(mae[["tax_16S"]]) |> as.data.frame(),
    valencia_centroids_mat = valencia_centroids_mat,
    distance = "YC"
  )

The alignments of topics look like this:

Code
# Align topics across K based on topic-composition similarity.
aligned_topics_transport <- 
  align_topics(models = labelled_lda_models, method = "transport")

# BUG FIX: `xlab` is not a labs() argument — labs(xlab = ...) stored a label
# for a non-existent "xlab" aesthetic, so the axis title never appeared.
# Use the xlab() layer instead.
plot(aligned_topics_transport) +
  labs(title = '"Transport" alignment',
       subtitle = "(= based on the similarities of sub-community compositions)") +
  xlab("Number of sub-communities (topics)")

Code
# Align topics across K based on sample-composition similarity.
aligned_topics_product <- 
  align_topics(models = labelled_lda_models, method = "product")

# BUG FIX: labs(xlab = ...) is silently ignored (no "xlab" aesthetic);
# replaced with the xlab() layer so the axis title appears.
plot(aligned_topics_product) +
  labs(title = '"Product" alignment',
       subtitle = "(= based on the similarities of sample compositions)") +
  xlab("Number of sub-communities (topics)")

One of the metrics used to identify an optimal number of topics is the number of alignment paths identified across values of K.

Code
# Combine per-topic diagnostics from both alignment methods.
topics <- 
  bind_rows(
    aligned_topics_transport@topics |> mutate(method = "transport"),
    aligned_topics_product@topics |> mutate(method = "product")
  )

# Number of alignment paths across K, side by side (patchwork).
plot_number_of_paths(compute_number_of_paths(aligned_topics_transport)) +
  ggtitle("Transport") +
plot_number_of_paths(compute_number_of_paths(aligned_topics_product)) +
  ggtitle("Product")

This suggests that there might be 7 or 9 true sub-communities.

We can also examine the distribution of coherence and refinement scores:

Code
# Coherence and refinement scores per K, side by side (patchwork); the
# warning about removed non-finite rows is expected for K = 1 topics.
ggplot(topics, aes(x = m, y = coherence, fill = method, col = method)) +
  geom_boxplot(alpha = 0.5) +
ggplot(topics, aes(x = m, y = refinement, fill = method, col = method)) +
  geom_abline(slope = 1, intercept = 0) +
  geom_boxplot(alpha = 0.5) +
  plot_layout(guides = "collect")
Warning: Removed 30 rows containing non-finite outside the scale range
(`stat_boxplot()`).

We see a first drop around K = 4-5, and a second drop at K = 8-9.

We can check the sub-communities composition for a selection of Ks:

Code
# Topic (sub-community) compositions at a selection of resolutions K.
plot_beta(aligned_topics_transport, models = c(5, 6, 7, 9)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

From these plots, it looks like K = 7 might be a good compromise between dimension reduction, representation accuracy, and minimizing the number of spurious topics.

We note that at none of these resolutions (K = 5, 6, 7, 9) do we find any topic similar to the subCSTs IV-C(0-4).

Optimal topic model

We store the sample topic-mixtures and the topic composition as a SE in the MAE

Code
# Store the K = 7 model (sample topic-mixtures + topic compositions) as a
# SummarizedExperiment named "topics_16S_7".
optimal_K <- 7
topic_SE <- 
  make_topic_SE(
    assay_name = str_c("topics_16S_", optimal_K), 
    lda_model = labelled_lda_models[[optimal_K]]
  )
mae <- c(mae, topic_SE)

“Constrained” Topic models of the taxa counts

One of the problems with the topic models fitted above is that non-Lacto species may be mixed with Lacto species in the topics. This may reflect actual biology, which would be interesting to study, but it may also simply correspond to a dataset-specific “statistical optimum” that does not have a biological interpretation.

The main “issue” is with the topic V dominated by L. jensenii:

Code
# Composition of the Lactobacillus-matched topics (I, III, V) in the K = 7
# model; only taxa with proportion > 1/500 are shown.
beta_long <- get_beta_long(mae, assayname = "topics_16S_7")
beta_long |> 
  filter(topic_subcst_matching_label %in% c("I","III","V")) |> 
  filter(prop > 1/500) |>
  arrange(topic, -prop) |>
  # Order taxa by decreasing proportion within topic.
  mutate(taxa = taxa |> factor(levels = unique(taxa))) |> 
  ggplot() +
  aes(y = taxa |> fct_rev(), x = prop, fill = topic) +
  geom_col() +
  facet_grid(. ~ topic_label) +
  guides(fill = "none") +
  ylab("") + xlab("proportion in topic") +
  scale_fill_viridis(
    discrete = TRUE, option = "A", direction = -1, end = 0.85, begin = 0.2
    )

One potential simple solution is to constrain “Lactobacillus topics” to only contain Lactobacillus species and fit the topic model on the remaining non-Lactobacillus species.

Lactobacillus topics

Code
# Taxa counts and relative abundances, samples in rows.
counts <- t(MultiAssayExperiment::assay(mae, "tax_16S"))
props <- t(MultiAssayExperiment::assay(mae, "tax_16S_p"))

# Long-format relative abundances (one row per sample x taxon).
props_long <- get_assay_long_format(
  mae, "tax_16S_p",
  add_colData = FALSE,
  feature_name = "taxa",
  values_name = "prop"
)
Code
# Distribution of each Lactobacillus species' relative abundance across
# samples, species ordered by decreasing median abundance.
props_long_lacto <- 
  props_long |>  
  filter(str_detect(taxa, "Lactobacillus")) |> 
  group_by(taxa) |>
  mutate(median_prop = median(prop)) |> 
  ungroup() |> 
  arrange(-median_prop) |>
  mutate(
    # Abbreviate "Lactobacillus" to "L." and fix the factor-level order.
    taxa = taxa |> str_replace("Lactobacillus", "L."),
    taxa = taxa |> factor(levels = unique(taxa))
    )

# props_long_lacto |> 
#   ggplot(aes(x = taxa, y = prop)) +
#   geom_boxplot(outlier.alpha = 0.25, outlier.size = 0.5, fill = "antiquewhite3", col = "antiquewhite4") +
#   xlab("Taxa") 

# Jittered per-sample proportions; colors match the topic colors used in
# later figures (I, III, V + the remaining species lumped as VI).
g <- 
props_long_lacto |> 
  ggplot(aes(y = taxa |> fct_rev(), x = prop, col = taxa)) +
  geom_jitter(width = 0, height = 0.15, size  = 0.5) +
  xlab("Proportion in samples") + ylab("") +
  scale_color_manual(values = get_topic_colors(c("III","I", "V", rep("VI", 7)))) +
  guides(col = "none")

g

Code
# Export the figure as supplementary panel S2C (cairo for unicode glyphs).
ggsave(g, filename = str_c(fig_out_dir(), "S2C.pdf"), width = 10, height = 3, device = cairo_pdf)

Among all Lactobacillus species, only 3 make up more than 50% of a sample in at least 10 samples: Lactobacillus iners, crispatus, and jensenii.

So, we’ll create 4 Lactobacillus topics:

  • topic I : 100% L. crispatus

  • topic II : 100% L. iners

  • topic V: 100% L. jensenii

  • topic VI: mix of the remaining Lactobacillus species

Code
# Define the four fixed "Lactobacillus topics" compositions (beta):
# I = 100% L. crispatus, III = 100% L. iners, V = 100% L. jensenii,
# VI = remaining Lactobacillus species weighted by their overall abundance.
Lacto_topics_beta <- 
  matrix(0, nrow = ncol(counts), ncol = 4) |> 
  set_colnames(c("I","III","V","VI")) |> 
  set_rownames(colnames(counts))

Lacto_topics_beta["Lactobacillus crispatus","I"] <- 1
Lacto_topics_beta["Lactobacillus iners","III"] <- 1
Lacto_topics_beta["Lactobacillus jensenii","V"] <- 1

all_Lacto_species <- str_subset(colnames(props), "Lactobacillus")
other_Lacto_species <- 
setdiff(
  all_Lacto_species, 
  str_c("Lactobacillus ", c("crispatus","iners","jensenii"))
  )

# Topic VI mixes the remaining species proportionally to their total
# abundance across all samples.
Lacto_topics_beta[other_Lacto_species,"VI"] <- 
  colSums(props[,other_Lacto_species])/sum(props[,other_Lacto_species])
Code
# Heatmap of the fixed Lactobacillus topic compositions.
Lacto_topics_beta |>
  as.data.frame() |>
  rownames_to_column("Taxa") |>
  pivot_longer(-Taxa, names_to = "topic", values_to = "prop") |>
  filter(prop > 0) |>
  arrange(topic, -prop) |>
  mutate(Taxa = factor(Taxa, levels = unique(Taxa))) |>
  ggplot(aes(y = fct_rev(Taxa), x = topic)) +
  geom_tile(aes(alpha = prop), fill = "#F56172") +
  geom_text(aes(label = round(prop, 2)), size = 4) +
  labs(x = "Topic", y = "")

Code
# Sample-level topic proportions (gamma) for the fixed Lactobacillus topics:
# crispatus, iners and jensenii proportions directly, plus the summed
# proportion of all other Lactobacillus species as topic VI.
Lacto_topics_gamma <- 
  cbind(
    props[, str_c("Lactobacillus ", c("crispatus","iners","jensenii"))], 
    rowSums(props[, other_Lacto_species])
  ) |> 
  set_colnames(c("I","III","V","VI"))

The distribution of these topics in samples is as follows:

Code
# Per-topic histograms of the Lactobacillus topic proportions across samples.
topic_props_long <- Lacto_topics_gamma |>
  as.data.frame() |>
  rownames_to_column("sample") |>
  pivot_longer(-sample, names_to = "topic", values_to = "prop")

ggplot(topic_props_long, aes(x = prop)) +
  geom_histogram(bins = 50) +
  facet_grid(topic ~ ., labeller = label_both) +
  theme(strip.text.y = element_text(angle = 0))

Non-Lactobacillus topics

We now fit the topic models for all non-Lactobacillus species. We do this by fitting topic models on the counts of non-Lactobacillus species for K = 1 to 15.

Code
# we remove the Lacto species
non_Lacto_counts <- counts[, !(colnames(counts) %in% all_Lacto_species)]
# we remove the samples that only have Lacto species
non_Lacto_counts <- non_Lacto_counts[rowSums(non_Lacto_counts) != 0, ]

# Largest number of topics to fit for the non-Lactobacillus models.
max_K <- 15

library(alto) # devtools::install_github("lasy/alto")

# Fit LDA models for K = 1..max_K on the non-Lactobacillus counts; results
# are cached in `dir` and reused unless reset = TRUE.
lda_models <- 
  run_lda_models(
    data = non_Lacto_counts, 
    lda_varying_params_lists = 
      setNames(purrr::map(1:max_K, ~ list(k = .)), 1:max_K),
    dir = str_c(data_dir(), "03_augmented_mae/lda_models_nonLacto/"),
    reset = FALSE, 
    verbose = TRUE
  )

# Taxonomy restricted (and reordered) to the non-Lactobacillus taxa.
tax_table <- 
  SummarizedExperiment::rowData(mae[["tax_16S"]]) |> 
  as.data.frame()
tax_table <- tax_table[colnames(non_Lacto_counts), ]

# Label topics after their most similar VALENCIA sub-CST (Yue-Clayton).
labelled_lda_models <- 
  label_models_topics(
    lda_models = lda_models,
    tax_table = tax_table,
    valencia_centroids_mat = ValenciaR::get_Valencia_clusters(),
    distance = "YC"
  )

Just as we did for the topic models above, we can compute the alignment of topics across K and plot the coherence and refinement scores of topics as a function of K.

Code
# Align the non-Lactobacillus topics across K by composition similarity.
aligned_topics_transport <- 
  align_topics(models = labelled_lda_models, method = "transport")

# BUG FIX: labs(xlab = ...) is silently ignored (no "xlab" aesthetic);
# replaced with the xlab() layer so the axis title appears.
plot(aligned_topics_transport) +
  labs(title = '"Transport" alignment',
       subtitle = "(= based on the similarities of sub-community compositions)") +
  xlab("Number of sub-communities (topics)")

Code
# Align the non-Lactobacillus topics across K by sample-composition
# similarity.
aligned_topics_product <- 
  align_topics(models = labelled_lda_models, method = "product")

# BUG FIX: labs(xlab = ...) is silently ignored (no "xlab" aesthetic);
# replaced with the xlab() layer so the axis title appears.
plot(aligned_topics_product) +
  labs(title = '"Product" alignment',
       subtitle = "(= based on the similarities of sample compositions)") +
  xlab("Number of sub-communities (topics)")

To identify the optimal K, we display the diagnostics scores (number of path, refinement, coherence) across resolution.

Code
# Per-topic diagnostics from both alignment methods (non-Lacto models).
topics <- 
  bind_rows(
    aligned_topics_transport@topics |> mutate(method = "transport"),
    aligned_topics_product@topics |> mutate(method = "product")
  )

# Number of alignment paths across K, side by side (patchwork).
plot_number_of_paths(compute_number_of_paths(aligned_topics_transport)) +
  ggtitle("Transport") +
plot_number_of_paths(compute_number_of_paths(aligned_topics_product)) +
  ggtitle("Product")

Code
# Compose supplementary figure S2A-B: topic alignment (A) and number of
# alignment paths (B) across K.
# NOTE(review): "Number of path" in the title looks like a typo for
# "Number of paths"; kept as-is to match the exported figure.
g <- 
plot(aligned_topics_transport) +
  labs(
    title = 'Topic alignment across K',
    subtitle = '"Transport" alignment, based on topic similarities'
  ) +
  xlab("K: Number of topics") +
  
  plot_number_of_paths(compute_number_of_paths(aligned_topics_product)) +
   labs(
    title = 'Number of path across K',
    subtitle = 'Alignment based on sample similarities'
  )  +
  xlab("K") +
  theme_publication() +
  plot_layout(widths = c(1.4, 1)) + 
  plot_annotation(tag_levels = "A")

g

Code
# Export supplementary panels S2A-B (cairo for unicode glyphs).
ggsave(g, filename = str_c(fig_out_dir(), "S2AB.pdf"), width = 13, height = 6, device = cairo_pdf)
Code
# Coherence and refinement scores per K, side by side (patchwork); the
# warning about removed non-finite rows is expected for K = 1 topics.
ggplot(topics, aes(x = m, y = coherence, fill = method, col = method)) +
  geom_boxplot(alpha = 0.5) +
ggplot(topics, aes(x = m, y = refinement, fill = method, col = method)) +
  geom_abline(slope = 1, intercept = 0) +
  geom_boxplot(alpha = 0.5) +
  plot_layout(guides = "collect")
Warning: Removed 30 rows containing non-finite outside the scale range
(`stat_boxplot()`).

The composition of topics at various resolutions can be visualized as follows:

Code
# Non-Lactobacillus topic compositions at a selection of resolutions K.
plot_beta(aligned_topics_transport, models = c(4, 5, 6, 11)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 

Combining Lactobacillus and non-Lactobacillus topics

From these visualizations, it looks like K = 4 or K = 5 would be good choices for the number of non-Lactobacillus topics.

We will thus save two topic models: one with K = 4 and one with K = 5, which we both combine with the “Lactobacillus topics” defined above. We then store these two models as a SE in the MAE.

K = 4

K = 4 is a good choice because it is a stable number of path across many K when relying on the product alignment.

Code
# Combine the 4 fixed Lactobacillus topics with the K = 4 non-Lactobacillus
# model (8 topics total) and store as assay "c_topics_16S_8".
combined_topics_K4 <- 
  combine_topics(4, labelled_lda_models, Lacto_topics_gamma, Lacto_topics_beta)

topic_SE <- 
  make_topic_SE(
    assay_name = str_c("c_topics_16S_", nrow(combined_topics_K4$beta)), 
    lda_model = combined_topics_K4
  )

mae <- c(mae, topic_SE)
Code
# Inspect the taxa composition (betas) of the combined 8-topic model.
plot_topic_betas(mae, "c_topics_16S_8") +
  xlab("topic number")

K = 5

K = 5 might also be a good choice because the transport alignment suggests that these 5 topics are quite well preserved at higher resolutions. It also highlights a little better the co-occurrence of Fannyhessea vaginae (prev. Atopobium vaginae) and G. swidsinskii/leopoldii and/or G. vaginalis (but not with G. piotii), which has been observed in previous cohorts (see Symul et al, 2023).

Code
# Same as for K = 4: combine the 5 non-Lactobacillus topics with the fixed
# Lactobacillus topics and store the resulting assay in the MAE.
combined_topics_K5 <- 
  combine_topics(5, labelled_lda_models, Lacto_topics_gamma, Lacto_topics_beta)

K5_topic_SE <- 
  make_topic_SE(
    lda_model = combined_topics_K5,
    assay_name = str_c("c_topics_16S_", nrow(combined_topics_K5$beta))
  )

mae <- c(mae, K5_topic_SE)
Code
plot_topic_betas(mae, "c_topics_16S_9")

Code
# Co-occurrence of Gardnerella species with Fannyhessea vaginae, using the
# 16S taxa-proportion assay; one panel per Gardnerella species.
taxa_props <- get_assay_wide_format(mae, "tax_16S_p")

taxa_props |> 
  select(Barcode, assay) |> 
  unnest(cols = c(assay)) |> 
  select(Barcode, contains("Gardnerella") | contains("Fannyhessea")) |> 
  pivot_longer(cols = contains("Gardnerella"), 
               names_to = "Gardnerella", values_to = "prop") |>
  # Per-species correlation with F. vaginae (computed but not displayed on
  # the plot; kept for interactive inspection).
  group_by(Gardnerella) |> 
  mutate(cor = cor(prop, `Fannyhessea vaginae`)) |> 
  ungroup() |> 
  ggplot(aes(x = `Fannyhessea vaginae`, y = prop, col = Gardnerella)) +
  geom_point(size = 0.5, alpha = 0.5) +
  facet_grid(. ~ Gardnerella) +
  guides(col = "none")

However, we note here that the proportions of G. piotii are very low in this cohort so we do not observe strong mutual exclusions with Fannyhessea.

Since both models (K = 4 or K = 5) are saved, we can use either of them in downstream analyses.

“Constrained” topic models with CTV05

We divide topic I defined above into two topics: one 100% composed of the CTV-05 strain and one 100% composed of the other Lactobacillus crispatus strains.

  • Topic I.a: CTV-05

  • Topic I.b: non-CTV-05 Lactobacillus crispatus

  • Topic I.c: undetermined Lactobacillus crispatus

Code
# Fixed "Lactobacillus topics" beta matrix (taxa x topics), now with
# L. crispatus split into CTV-05 / non-CTV-05 / undetermined rows.
# NOTE(review): nrow = ncol(counts) - 1 assumes `counts` has exactly one
# L. crispatus column (the one removed by str_subset below) — TODO confirm.
Lacto_topics_beta <- 
  matrix(0, nrow = ncol(counts) - 1, ncol = 6) |> 
  set_rownames(colnames(counts) |> str_subset("crispatus", negate = TRUE))

# Prepend the three crispatus-strain rows and name the six topics.
Lacto_topics_beta <- 
  matrix(0, nrow = 3, ncol = 6) |> 
  set_rownames(str_c("Lactobacillus crispatus (", c("CTV05", "non-CTV05", "?"), ")")) |> 
  rbind(Lacto_topics_beta) |> 
  set_colnames(c("I.a", "I.b", "I.c","III","V","VI")) 

# "Pure" topics: each is 100% a single taxon.
Lacto_topics_beta["Lactobacillus crispatus (CTV05)","I.a"] <- 1
Lacto_topics_beta["Lactobacillus crispatus (non-CTV05)","I.b"] <- 1
Lacto_topics_beta["Lactobacillus crispatus (?)","I.c"] <- 1
Lacto_topics_beta["Lactobacillus iners","III"] <- 1
Lacto_topics_beta["Lactobacillus jensenii","V"] <- 1

# Topic VI pools the remaining Lactobacillus species, weighted by their
# overall abundance across samples in `props`.
all_Lacto_species <- str_subset(colnames(props), "Lactobacillus")
other_Lacto_species <- 
setdiff(
  all_Lacto_species, 
  str_c("Lactobacillus ", c("crispatus","iners","jensenii"))
  )

Lacto_topics_beta[other_Lacto_species,"VI"] <- 
  colSums(props[,other_Lacto_species])/sum(props[,other_Lacto_species])
Code
# Heatmap of the Lactobacillus topic definitions: taxa (rows) x topics
# (columns), with tile opacity and text labels showing each taxon's weight.
beta_long <- 
  Lacto_topics_beta |> 
  as.data.frame() |> 
  rownames_to_column("Taxa") |>
  pivot_longer(-Taxa, names_to = "topic", values_to = "prop") |>
  filter(prop > 0) |>
  arrange(topic, -prop) |>
  mutate(Taxa = factor(Taxa, levels = unique(Taxa)))

ggplot(beta_long, aes(y = fct_rev(Taxa), x = topic)) +
  geom_tile(aes(alpha = prop), fill = "#F56172") +
  geom_text(aes(label = round(prop, 2)), size = 4) +
  ylab("") + xlab("Topic")

Code
# Replace the single "Lactobacillus crispatus" column in `props` with the
# three strain-resolution columns from the metagenomic MG_CTV05 assay.
ctv05_wide <- get_assay_wide_format(mae, "MG_CTV05")

# Index of the crispatus column to drop from `props`.
j <- which(colnames(props) == "Lactobacillus crispatus")
props_with_ctv05 <- 
  ctv05_wide |> 
  select(Barcode, assay) |> 
  unnest(assay) |> 
  select(-`Non L. crispatus`) |> 
  dplyr::rename(
    "Lactobacillus crispatus (CTV05)" = `CTV-05`, 
    "Lactobacillus crispatus (non-CTV05)" = `non-CTV-05 L. crispatus`,
    "Lactobacillus crispatus (?)" = `Undetermined L. crispatus`
    ) |> 
  # Keep only samples present in both the MG_CTV05 and 16S assays.
  inner_join(
    props[, -j] |> as.data.frame() |> rownames_to_column("Barcode"),
    by = join_by(Barcode)
    ) |> 
  as.data.frame() |> 
  column_to_rownames("Barcode") |> 
  as.matrix()

# Renormalize so each sample's proportions sum to 1 again.
props_with_ctv05 <- 
  props_with_ctv05 / rowSums(props_with_ctv05)
Code
# Sample-level topic proportions (gamma): pure-taxon topics take the taxon's
# proportion directly; topic VI pools the remaining Lactobacillus species.
pure_lacto_topics <- 
  c(
    "Lactobacillus crispatus (CTV05)",
    "Lactobacillus crispatus (non-CTV05)",
    "Lactobacillus crispatus (?)",
    "Lactobacillus iners",
    "Lactobacillus jensenii"
  )

Lacto_topics_gamma <- 
  cbind(
    props_with_ctv05[, pure_lacto_topics], 
    rowSums(props_with_ctv05[, other_Lacto_species])
  ) |> 
  set_colnames(c("I.a", "I.b", "I.c","III","V","VI"))

The distribution of these topics in samples is as follows:

Code
# Distribution of each Lactobacillus topic's proportion across samples.
gamma_long <- 
  Lacto_topics_gamma |> 
  as.data.frame() |> 
  rownames_to_column("sample") |>
  pivot_longer(-sample, names_to = "topic", values_to = "prop")

ggplot(gamma_long, aes(x = prop)) +
  geom_histogram(bins = 50) +
  facet_grid(topic ~ ., labeller = label_both) +
  theme(strip.text.y = element_text(angle = 0))

Combining Lactobacillus and non-Lactobacillus topics

Code
# Combine the 4 non-Lactobacillus topics with the CTV-05-resolved
# Lactobacillus topics and store the assay (suffixed "_ctv05") in the MAE.
combined_topics_K4 <- 
  combine_topics(4, labelled_lda_models, Lacto_topics_gamma, Lacto_topics_beta)

ctv05_topic_SE <- 
  make_topic_SE(
    lda_model = combined_topics_K4,
    assay_name = str_c("c_topics_16S_", nrow(combined_topics_K4$beta), "_ctv05")
  )

mae <- c(mae, ctv05_topic_SE)
Code
plot_topic_betas(mae, "c_topics_16S_10_ctv05")

Cytokine assay data transformation

In this section, we filter and transform cytokine data.

Code
# Cytokine concentrations in long format (one row per sample x cytokine),
# with sample-level colData attached for downstream modeling.
cytok <- 
  get_assay_long_format(
    mae, "cytokine", add_colData = TRUE,
    feature_name = "cytokine", values_name = "concentration"
    )

Exploratory analyses

Code
# Histograms of raw cytokine concentrations, one panel per cytokine.
g_raw_cytok <- 
  cytok |> 
  ggplot(aes(x = concentration, fill = cytokine)) +
  geom_histogram(bins = 50) +
  facet_wrap(cytokine ~ .) +
  guides(fill = "none") +
  ggtitle('Distribution of "raw" cytokine concentrations',
          subtitle = "with imputed ULOQ and LLOQ values")


g_raw_cytok 

Code
# Same histograms on a log10 x-axis.
g_raw_cytok +
  scale_x_log10() 

From these first plots, it is clear that the data needs some transformation, potentially a log transformation if that also helps stabilizing the variance. We also see that for many samples, concentrations were either below or above the limits of quantification (LLOQ and ULOQ) and were imputed.

We also look at the distribution per sample:

Code
# cytok |> 
#   group_by(Barcode) |> 
#   mutate(median_log_concentration = median(log(concentration))) |>
#   ungroup() |> 
#   arrange(median_log_concentration) |> mutate(Barcode = Barcode |> fct_inorder()) |> 
#   ggplot(aes(x = Barcode, y = concentration)) +
#   # geom_boxplot(outlier.shape = NA, linewidth = 0.2) +
#   geom_point(aes(col = cytokine), size = 0.5, alpha = 0.6) + 
#   geom_point(stat = "summary", fun = median, col = "black") +
#   scale_y_log10() +
#   theme(axis.text.x = element_blank())


# cytok |> 
#   group_by(Barcode) |> 
#   mutate(median_log_concentration = median(log(concentration))) |>
#   ungroup() |> 
#   arrange(median_log_concentration) |> mutate(Barcode = Barcode |> fct_inorder()) |> 
#   ggplot(aes(x = Barcode, y = concentration)) +
#   # geom_boxplot(outlier.shape = NA, linewidth = 0.2) +
#   geom_point(aes(col = cytokine), size = 0.5, alpha = 0.6) + 
#   scale_y_log10() +
#   facet_grid(cytokine ~ .) +
#   theme(axis.text.x = element_blank())


# Heatmap of log10 concentrations: samples (x) ordered by their per-sample
# median, cytokines (y) ordered by their per-cytokine median, to reveal the
# sample-wide "size effect".
cytok |> 
  group_by(Barcode) |> 
  mutate(median_log10_concentration_per_sample = median(log10(concentration))) |> 
  ungroup() |> 
  arrange(median_log10_concentration_per_sample) |> 
  mutate(Barcode = Barcode |> fct_inorder()) |> 
  group_by(cytokine) |> 
  mutate(median_log10_concentration_per_cytokine = median(log10(concentration))) |> 
  ungroup() |>
  arrange(median_log10_concentration_per_cytokine) |> 
  mutate(cytokine = cytokine |> fct_inorder()) |>
  ggplot(aes(x = Barcode, y = cytokine, fill = concentration |> log10())) +
  geom_tile() +
  theme(axis.text.x = element_blank()) +
  scale_fill_gradient(low = "seashell", high = "midnightblue")

We see that there might be a strong “size effect” in the data, as samples with high levels of one cytokine appear to also have high levels of other cytokines.

Log transformation for variance stabilization.

We check if a \(\log_{10}\) transformation stabilizes the variance of the data.

Code
# Add a log10-transformed concentration column; keep key columns first.
cytok <- 
  cytok |> mutate(log10_concentration = log10(concentration)) |> 
  select(Barcode, cytokine, concentration, log10_concentration, everything())


# Per-cytokine mean and variance on both scales, to assess whether the log10
# transform stabilizes the mean-variance relationship.
cytok_summary <- 
  cytok |> 
  group_by(cytokine) |> 
  summarize(
    `mean concentration` = mean(concentration, na.rm = TRUE),
    `var concentration` = var(concentration, na.rm = TRUE),
    `mean log10(concentration)` = mean(log10_concentration, na.rm = TRUE),
    `var log10(concentration)` = var(log10_concentration, na.rm = TRUE),
    .groups = "drop"
  )

# Mean-variance plots (raw vs log10), combined side-by-side with patchwork.
cytok_summary |> 
  ggplot(aes(x = `mean concentration`, y = `var concentration`)) +
  geom_point() +
  geom_smooth(method = "lm", formula = "y ~ x") +
  scale_x_log10() + scale_y_log10() +
  ggtitle("Mean-Variance on raw concentration") +

cytok_summary |> 
  ggplot(aes(x = `mean log10(concentration)`, y = `var log10(concentration)`)) +
  geom_point() +
  geom_smooth(method = "lm", formula = "y ~ x") +
  ggtitle("Mean-Variance on log10-transformed concentration") +
  labs(caption = "Each dot is a cytokine")

The log10-transformation seems to stabilize the variance, although there might still be residual heteroskedasticity. We will use the log10-transformed data for downstream analyses.

We add an assay with the log10-transformed data to our multi-assay experiment object:

Code
# Store the log10-transformed cytokine concentrations as a new assay
# ("cytokine_log10") in the MAE.
cytokines <- MultiAssayExperiment::assay(mae, "cytokine")
transformed_cytokines <- log10(cytokines)

transformed_cytokines_assay <- list(
  cytokine_log10 = SummarizedExperiment::SummarizedExperiment(
    assay = transformed_cytokines
  )
)

mae <- c(mae, transformed_cytokines_assay)

Size effect

As seen on one of the plots above, there might be a strong size effect in the data. This size effect might be a biological size effect (co-regulation of all cytokines) or an artifact related to the amount of material that was on each swab.

First, let’s characterize that effect by computing the correlations between cytokine transformed concentrations.

Correlations between cytokines
Code
# Pairwise correlations between cytokines (samples as observations, hence t()).
cytok_corr <- cor(transformed_cytokines |> t())
# Range of BH-adjusted p-values, supporting the "all significantly > 0" claim
# in the text below; not used further in this chunk.
res <- Hmisc::rcorr(transformed_cytokines |> t())$P |> p.adjust("BH") |> range(na.rm = TRUE)
corrplot::corrplot(cytok_corr, method = "color", diag = FALSE, addCoef.col = "black", tl.col = "black", tl.srt = 45)

Correlation matrix of cytokine log10-transformed concentrations
Code
ggsave(g, filename = str_c(fig_out_dir(), "S4B.pdf"), width = 9, height = 9, device = cairo_pdf)
Associations between the mean z-log10(c) and various variables

Correlations are all highly positive (and statistically different from 0).

To assess whether this size effect might be a technical or biological effect, we examine the association between the per-sample-mean of the scaled log10 concentrations and a series of variables that may drive that biological effect. The variables we check for are:

  • sample’s visit
  • participants IDs
  • participants’ age
  • participants’ race
  • participants’ birth control method
  • the proportions of Lactobacillus in each sample
  • the proportion of non-iners Lactobacillus
Code
# Rebuild the long cytokine table from the log10 assay and attach the
# Lactobacillus proportions (kept nested as the MB_props df-column).
cytok <- 
  get_assay_long_format(
    mae, "cytokine_log10", feature_name = "cytokine", values_name = "log10_concentration"
    ) |> 
  left_join(
    get_assay_wide_format(mae, "prop_Lacto", add_colData = FALSE) |> dplyr::rename(MB_props = assay),
    by = join_by(Barcode)
  ) |> 
  mutate(Visit = AVISITN |> get_visit_labels())

# Per-sample mean of the per-cytokine z-scored log10 concentrations.
# MB_props$... extracts columns from the packed df-column (row-aligned).
cytok_summary <- 
  cytok |> 
  group_by(cytokine) |> mutate(scaled_log10_concentration = scale(log10_concentration)) |> ungroup() |> 
  mutate(prop_Lacto = MB_props$prop_Lacto, prop_non_iners_Lacto = MB_props$prop_non_iners_Lacto) |> 
  group_by(Barcode, USUBJID, Visit, AGE, RACEGR2, BC, prop_Lacto, prop_non_iners_Lacto) |> 
  summarize(`mean z-log10(conc.)` = mean(scaled_log10_concentration), .groups = "drop")
Code
# Per-visit distribution of the per-sample mean; gray paths connect repeated
# samples from the same participant.
cytok_summary |> 
  ggplot(aes(x = Visit, y = `mean z-log10(conc.)`)) +
  geom_path(aes(group = USUBJID), col = "gray70", alpha = 0.3) +
  geom_boxplot(alpha = 0.5) +
  geom_jitter(height = 0, width = 0.2) 

Code
lm(`mean z-log10(conc.)` ~ Visit, data = cytok_summary) |> summary()

Call:
lm(formula = `mean z-log10(conc.)` ~ Visit, data = cytok_summary)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.32782 -0.52355 -0.09988  0.39511  2.52702 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)    0.09340    0.04792   1.949   0.0515 .  
VisitPost-MTZ -0.31281    0.06714  -4.659 3.55e-06 ***
VisitWeek 4   -0.06772    0.06986  -0.969   0.3326    
VisitWeek 8   -0.02361    0.07037  -0.336   0.7373    
VisitWeek 12  -0.07429    0.07006  -1.060   0.2892    
VisitWeek 16   0.32725    0.19714   1.660   0.0972 .  
VisitWeek 20   0.24407    0.24844   0.982   0.3261    
VisitWeek 24  -0.09972    0.07220  -1.381   0.1675    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6895 on 1143 degrees of freedom
Multiple R-squared:  0.02936,   Adjusted R-squared:  0.02341 
F-statistic: 4.938 on 7 and 1143 DF,  p-value: 1.628e-05

Concentrations are significantly lower at the post-MTZ visit, which is consistent with either a biological effect or a technical artifact due to the amount of material on the swab.

Code
# Order participants by the median of their per-sample means so the boxplot
# reads as an increasing trend across participants.
cytok_summary <- 
  cytok_summary |> 
  group_by(USUBJID) |> 
  mutate(median_mean = median(`mean z-log10(conc.)`)) |> 
  ungroup() |> 
  arrange(median_mean) |> 
  mutate(USUBJID = fct_inorder(USUBJID))

cytok_summary |> 
  ggplot(aes(x = USUBJID, y = `mean z-log10(conc.)`)) +
  geom_boxplot(fill = "aquamarine3") # + geom_point(aes(color = Visit))

Code
lm(`mean z-log10(conc.)` ~ USUBJID, data = cytok_summary) |> anova()
Analysis of Variance Table

Response: mean z-log10(conc.)
           Df Sum Sq Mean Sq F value    Pr(>F)    
USUBJID   211 273.62 1.29680   4.255 < 2.2e-16 ***
Residuals 939 286.18 0.30477                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

It also looks like there is a strong participant-effect.

Let’s explore this in more detail for 10 participants selected evenly across the range of participant-level median concentrations:

Code
# Pick 10 participants evenly spaced along the participant levels (ordered by
# median concentration in the previous chunk), then show their per-visit
# z-scored cytokine profiles as a heatmap.
selected_USUBJID <- levels(cytok_summary$USUBJID)
# Spell out `length.out` (was partially matched as `len`).
selected_USUBJID <- selected_USUBJID[seq(1, length(selected_USUBJID), length.out = 10) |> round()]

cytok |> 
  group_by(cytokine) |> 
  mutate(z_log10_concentration = scale(log10_concentration)) |> 
  ungroup() |> 
  filter(USUBJID %in% selected_USUBJID) |> 
  ggplot(aes(x = cytokine, y = Visit |> fct_rev(), fill = z_log10_concentration)) +
  geom_tile() +
  facet_grid(USUBJID ~ ., scales = "free", space = "free") +
  scale_fill_gradient2() +
  ylab("Visit")

Code
# Per-sample mean vs participant age; one colored path per participant.
cytok_summary |>
  ggplot(aes(x = AGE, y = `mean z-log10(conc.)`, col = USUBJID, group = USUBJID)) +
  geom_path(position = position_dodge(width = 0.4)) +
  geom_point(position = position_dodge(width = 0.4), alpha = 0.7) +
  guides(col = "none")

Code
# lm_model <- lm(`mean z-log10(conc.)` ~ AGE + USUBJID, data = cytok_summary)
# summary(lm_model)

# Mixed model: fixed age effect, random intercept per participant.
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ AGE + (1|USUBJID), data = cytok_summary)
summary(lme_model)
Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ AGE + (1 | USUBJID)
   Data: cytok_summary

REML criterion at convergence: 2203.4

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.4454 -0.6196 -0.1534  0.5402  3.7245 

Random effects:
 Groups   Name        Variance Std.Dev.
 USUBJID  (Intercept) 0.1695   0.4117  
 Residual             0.3059   0.5530  
Number of obs: 1151, groups:  USUBJID, 212

Fixed effects:
             Estimate Std. Error t value
(Intercept)  0.502676   0.149976   3.352
AGE         -0.016438   0.004741  -3.467

Correlation of Fixed Effects:
    (Intr)
AGE -0.976
Code
car::Anova(lme_model, type = "II")
Analysis of Deviance Table (Type II Wald chisquare tests)

Response: mean z-log10(conc.)
     Chisq Df Pr(>Chisq)    
AGE 12.019  1  0.0005266 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

There is a statistically significant, but very mild, decrease with age, which might be consistent with either a biological effect or a technical artifact.

Code
# Per-sample mean by race group; paths/points colored per participant.
cytok_summary |>
  ggplot(aes(x = RACEGR2, y = `mean z-log10(conc.)`)) +
  geom_boxplot(outlier.shape = NA) +
  geom_path(aes(col = USUBJID, group = USUBJID), position = position_dodge(width = 0.4)) +
  geom_point(aes(col = USUBJID, group = USUBJID), position = position_dodge(width = 0.4), alpha = 0.7) +
  guides(col = "none")

Code
# lm_model <- lm(`mean z-log10(conc.)` ~ RACEGR2 + USUBJID, data = cytok_summary)
# summary(lm_model)

# Mixed model: fixed race-group effect, random intercept per participant.
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ RACEGR2 + (1|USUBJID), data = cytok_summary)
summary(lme_model)
Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ RACEGR2 + (1 | USUBJID)
   Data: cytok_summary

REML criterion at convergence: 2210.8

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.4081 -0.6215 -0.1540  0.5512  3.7666 

Random effects:
 Groups   Name        Variance Std.Dev.
 USUBJID  (Intercept) 0.1830   0.4278  
 Residual             0.3055   0.5527  
Number of obs: 1151, groups:  USUBJID, 212

Fixed effects:
             Estimate Std. Error t value
(Intercept)  -0.04999    0.05593  -0.894
RACEGR2White  0.10408    0.07815   1.332
RACEGR2Other  0.02159    0.08850   0.244

Correlation of Fixed Effects:
            (Intr) RACEGR2W
RACEGR2Whit -0.716         
RACEGR2Othr -0.632  0.452  
Code
car::Anova(lme_model, type = "II")
Analysis of Deviance Table (Type II Wald chisquare tests)

Response: mean z-log10(conc.)
         Chisq Df Pr(>Chisq)
RACEGR2 1.9349  2     0.3801

No significant differences here.

Code
# Per-sample mean by birth control method (samples with missing BC dropped).
cytok_summary |>
  filter(!is.na(BC)) |> 
  ggplot(aes(x = BC, y = `mean z-log10(conc.)`)) +
  geom_boxplot(outlier.shape = NA) +
  geom_path(aes(col = USUBJID, group = USUBJID), position = position_dodge(width = 0.4)) +
  geom_point(aes(col = USUBJID, group = USUBJID), position = position_dodge(width = 0.4), alpha = 0.7) +
  guides(col = "none")

Code
# lm_model <- lm(`mean z-log10(conc.)` ~ BC + USUBJID, data = cytok_summary)
# summary(lm_model)

# Mixed model: fixed birth-control effect, random intercept per participant.
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ BC + (1|USUBJID), data = cytok_summary)
summary(lme_model)
Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ BC + (1 | USUBJID)
   Data: cytok_summary

REML criterion at convergence: 2201.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.2238 -0.6172 -0.1421  0.5450  3.7989 

Random effects:
 Groups   Name        Variance Std.Dev.
 USUBJID  (Intercept) 0.1719   0.4146  
 Residual             0.3054   0.5526  
Number of obs: 1150, groups:  USUBJID, 211

Fixed effects:
                     Estimate Std. Error t value
(Intercept)          -0.00498    0.08537  -0.058
BCP only             -0.03329    0.15367  -0.217
BCIUD (Non-hormonal)  0.30147    0.13446   2.242
BCIUD (Hormonal)      0.10550    0.12110   0.871
BCNon-hormonal       -0.07735    0.09178  -0.843
BCunknown            -0.11842    0.16258  -0.728

Correlation of Fixed Effects:
            (Intr) BCPonl BCIUD(N BCIUD(H BCNn-h
BCP only    -0.536                              
BCIUD(Nn-h) -0.635  0.340                       
BCIUD(Hrmn) -0.704  0.377  0.447                
BCNon-hrmnl -0.877  0.470  0.557   0.620        
BCunknown   -0.492  0.264  0.313   0.353   0.446
Code
car::Anova(lme_model, type = "II")
Analysis of Deviance Table (Type II Wald chisquare tests)

Response: mean z-log10(conc.)
    Chisq Df Pr(>Chisq)  
BC 13.733  5     0.0174 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

There is a significant effect of the birth control method. Again, that may indicate a biological effect (copper IUD may increase overall inflammation) or a technical artifact (some birth controls may alter the mucus and how much material may be typically collected on swab).

Code
# Per-sample mean vs proportion of Lactobacillus (left) and proportion of
# non-iners Lactobacillus (right), combined side-by-side with patchwork.
cytok_summary |> 
  ggplot(aes(x = prop_Lacto, y = `mean z-log10(conc.)`)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  
cytok_summary |> 
  ggplot(aes(x = prop_non_iners_Lacto, y = `mean z-log10(conc.)`)) +
  geom_point() +
  geom_smooth(se = FALSE)
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 4 rows containing non-finite outside the scale range (`stat_smooth()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Code
# Mixed model: fixed prop_Lacto effect, random intercept per participant.
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ prop_Lacto + (1|USUBJID), data = cytok_summary)
summary(lme_model)
Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ prop_Lacto + (1 | USUBJID)
   Data: cytok_summary

REML criterion at convergence: 2198.3

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.3427 -0.6161 -0.1482  0.5346  3.8015 

Random effects:
 Groups   Name        Variance Std.Dev.
 USUBJID  (Intercept) 0.1802   0.4245  
 Residual             0.3050   0.5523  
Number of obs: 1147, groups:  USUBJID, 211

Fixed effects:
            Estimate Std. Error t value
(Intercept)  0.05137    0.04068   1.263
prop_Lacto  -0.11173    0.04544  -2.459

Correlation of Fixed Effects:
           (Intr)
prop_Lacto -0.557
Code
car::Anova(lme_model, type = "II")
Analysis of Deviance Table (Type II Wald chisquare tests)

Response: mean z-log10(conc.)
            Chisq Df Pr(>Chisq)  
prop_Lacto 6.0452  1    0.01394 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
# Same mixed model with the proportion of non-iners Lactobacillus instead.
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ prop_non_iners_Lacto + (1|USUBJID), data = cytok_summary)
summary(lme_model)
Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ prop_non_iners_Lacto + (1 | USUBJID)
   Data: cytok_summary

REML criterion at convergence: 2202.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.3529 -0.6256 -0.1529  0.5509  3.7684 

Random effects:
 Groups   Name        Variance Std.Dev.
 USUBJID  (Intercept) 0.1805   0.4248  
 Residual             0.3064   0.5535  
Number of obs: 1147, groups:  USUBJID, 211

Fixed effects:
                     Estimate Std. Error t value
(Intercept)           0.01094    0.03616   0.303
prop_non_iners_Lacto -0.06620    0.05531  -1.197

Correlation of Fixed Effects:
            (Intr)
prp_nn_nr_L -0.353
Code
car::Anova(lme_model, type = "II")
Analysis of Deviance Table (Type II Wald chisquare tests)

Response: mean z-log10(conc.)
                      Chisq Df Pr(>Chisq)
prop_non_iners_Lacto 1.4325  1     0.2314

Here, we see a mild but statistically significant association with the proportion of Lactobacillus in the sample. The effect is very small and accounts for very little of the observed variance.

Based on these plots, we conclude that technical variability such as variability in the total amount of material collected on swabs might be as or more important than biological factors in driving the size effect observed in the cytokine concentrations.

Consequently, we adjust for this size effect, thereby attempting to correct for the total amount of material collected on swab. We do this adjustment by removing the first principal component of the scaled log concentrations.

PCA of the scaled log concentrations & size effect estimation

The PCA of the scaled log(concentrations) also highlight that strong size effect.

Code
# Wide log10 cytokine table joined with CST labels and Lactobacillus
# proportions per sample.
cytokines <- 
  get_assay_wide_format(mae, "cytokine_log10") |> 
  left_join(
    get_assay_wide_format(mae, "CST", add_colData = FALSE) |> unnest(assay),
    by = "Barcode"
  ) |> 
  left_join(
    get_assay_wide_format(mae, "prop_Lacto", add_colData = FALSE) |> unnest(assay),
    by = "Barcode"
  )

# PCA on the scaled log10 concentrations. Convert the packed assay column to
# a matrix before setting rownames: setting rownames on a tibble is
# deprecated (this previously triggered a warning).
cytokine_pca <- 
  cytokines$assay |> 
  as.matrix() |> 
  set_rownames(cytokines$Barcode) |> 
  prcomp(scale = TRUE)
Warning: Setting row names on a tibble is deprecated.
Code
# Eigenvalue table (std dev, % variance, cumulative % variance) per PC.
cytokine_pca_var <- 
  cytokine_pca |>
  broom::tidy(matrix = "eigenvalues") 
Code
plot_pca_var(cytokine_pca_var)

Code
# Formatted (gt) table of the variance explained per PC.
cytokine_pca_var|> 
  dplyr::rename(`% var` = percent, `cumulative % var` = cumulative) |> 
  gt() |> 
  fmt_percent(columns = c(`% var`, `cumulative % var`)) |> 
  fmt_number(columns = std.dev, decimals = 2)
PC std.dev % var cumulative % var
1 3.00 49.85% 49.85%
2 1.20 8.06% 57.90%
3 1.11 6.83% 64.74%
4 1.04 5.98% 70.72%
5 0.92 4.68% 75.41%
6 0.83 3.83% 79.23%
7 0.75 3.16% 82.39%
8 0.70 2.71% 85.10%
9 0.69 2.61% 87.71%
10 0.62 2.14% 89.84%
11 0.58 1.88% 91.72%
12 0.55 1.70% 93.42%
13 0.53 1.57% 95.00%
14 0.50 1.40% 96.40%
15 0.47 1.21% 97.61%
16 0.45 1.15% 98.76%
17 0.37 0.77% 99.52%
18 0.29 0.47% 100.00%
Code
plot_pca_correlation_circle(cytokines$assay, cytokine_pca, cytokine_pca_var)
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

The first principal component captures the size effect, and we see that it is strongly correlated with the “mean scaled log10(concentration)” we computed above:

Code
# PC1 vs the per-sample mean computed earlier: both capture the size effect.
cytok_summary |> 
  left_join(cytokine_pca$x |> as.data.frame() |> rownames_to_column("Barcode"), by = join_by(Barcode)) |> 
  ggplot(aes(x = PC1, y = `mean z-log10(conc.)`)) +
  geom_point()

Such that the relationships we highlighted above can also be seen in the PCA space:

Code
# Attach PC scores (.fittedPC*) to the sample metadata, then plot the PCA
# scores colored by Lactobacillus proportion.
Sample_PCs <- cytokine_pca |> broom::augment(cytokines |> select(-assay))  

plot_pca_samples(Sample_PCs, pca_var = cytokine_pca_var, color_by = "prop_Lacto") +
  scale_color_gradient2(
    "Prop Lactobacillus",
    low = "red", high = "dodgerblue3", mid = "gray80", midpoint = 0.5) +
  theme(legend.position = "top") #+ coord_fixed()

PCA scores plot, colored by samples’ Lactobacillus proportion.
Code
# PC1 vs Lactobacillus proportion with a linear fit.
Sample_PCs |> 
  ggplot(aes(x = prop_Lacto, y = .fittedPC1)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "lm") 
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Code
# Mixed model: PC1 vs prop_Lacto with a per-participant random intercept.
lme_model <- lme4::lmer(.fittedPC1 ~ prop_Lacto + (1|USUBJID), data = Sample_PCs)
summary(lme_model)
Linear mixed model fit by REML ['lmerMod']
Formula: .fittedPC1 ~ prop_Lacto + (1 | USUBJID)
   Data: Sample_PCs

REML criterion at convergence: 5523.6

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.3360 -0.6077 -0.1474  0.5510  3.6575 

Random effects:
 Groups   Name        Variance Std.Dev.
 USUBJID  (Intercept) 3.385    1.840   
 Residual             5.542    2.354   
Number of obs: 1147, groups:  USUBJID, 211

Fixed effects:
            Estimate Std. Error t value
(Intercept)   0.2342     0.1750   1.338
prop_Lacto   -0.5090     0.1940  -2.624

Correlation of Fixed Effects:
           (Intr)
prop_Lacto -0.552
Code
car::Anova(lme_model, type = "II")
Analysis of Deviance Table (Type II Wald chisquare tests)

Response: .fittedPC1
            Chisq Df Pr(>Chisq)   
prop_Lacto 6.8844  1   0.008695 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
# Same mixed model for PC2, which relates much more strongly to prop_Lacto.
lme_model <- lme4::lmer(.fittedPC2 ~ prop_Lacto + (1|USUBJID), data = Sample_PCs)
summary(lme_model)
Linear mixed model fit by REML ['lmerMod']
Formula: .fittedPC2 ~ prop_Lacto + (1 | USUBJID)
   Data: Sample_PCs

REML criterion at convergence: 3148.7

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.1765 -0.5471  0.0158  0.5903  3.1020 

Random effects:
 Groups   Name        Variance Std.Dev.
 USUBJID  (Intercept) 0.2900   0.5385  
 Residual             0.7376   0.8588  
Number of obs: 1147, groups:  USUBJID, 211

Fixed effects:
            Estimate Std. Error t value
(Intercept)  0.85752    0.05708   15.02
prop_Lacto  -1.68852    0.06929  -24.37

Correlation of Fixed Effects:
           (Intr)
prop_Lacto -0.606
Code
car::Anova(lme_model, type = "II")
Analysis of Deviance Table (Type II Wald chisquare tests)

Response: .fittedPC2
            Chisq Df Pr(>Chisq)    
prop_Lacto 593.86  1  < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We see a very mild (but statistically significant) relationship between the first PC and the proportion of Lactobacillus, but a much stronger one with the 2nd PC.

Code
# PCA scores colored by community state type (CST), with consistent colors.
plot_pca_samples(Sample_PCs, pca_var = cytokine_pca_var, color_by = "CST") +
  scale_color_manual(
    values = get_topic_colors(Sample_PCs$CST |> unique() |> sort())
  )

PCA scores plot, colored by samples’ CST
Code
# PCA scores colored by birth control method, with group ellipses.
plot_pca_samples(Sample_PCs, pca_var = cytokine_pca_var, color_by = "BC") +
  stat_ellipse()
Too few points to calculate an ellipse
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_path()`).

PCA scores plot, colored by participants’ BC
Code
# Same plot restricted to visits 0 and 7, faceted by visit and study arm.
plot_pca_samples(
  Sample_PCs |> filter(AVISITN %in% c(0, 7)), 
  pca_var = cytokine_pca_var, color_by = "BC") +
  stat_ellipse() +
  facet_grid(AVISITN ~ ARM) 
Too few points to calculate an ellipse
Too few points to calculate an ellipse
Too few points to calculate an ellipse
Too few points to calculate an ellipse
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_path()`).

PCA scores plot, colored by participants’ BC
Code
# PCA scores faceted by study arm and visit, restricted to PIPV samples
# (PIPV is a logical flag in the sample metadata — semantics defined
# upstream; confirm against colData documentation).
plot_pca_samples(
  Sample_PCs |> 
    filter(PIPV) |> 
    mutate(Visit = str_c("Visit: ", AVISITN) |> factor()), 
  pca_var = cytokine_pca_var, color_by = "Visit") +
  facet_grid(ARM ~ Visit) +
  guides(col = "none")

Removal of size effect (PC1 subtraction)

To remove that size effect, we remove the first PC from the scaled log10-transformed concentrations. This is almost equivalent to regressing out the mean scaled log10(concentration) from the data.

To remove the PC1 from the data, we set the first PC to 0 then multiply by the inverse of the rotation matrix.

Indeed, we have:

\(E = XR\) where \(E\) are the coordinates in the PC space, \(X\) are the coordinates in the original space (the scaled log10-transformed concentrations), and \(R\) is the rotation matrix.

Code
# Sanity check: prcomp's scores equal scale(X) %*% rotation.
# NOTE(review): exact `==` on doubles — it evaluates to TRUE here (same
# floating-point operations on both sides), but all.equal() would be the
# robust comparison.
X <- cytokines$assay |> as.matrix() |> scale()
(cytokine_pca$x == X %*% cytokine_pca$rotation) |> all()
[1] TRUE

So, we have \(X = E R^{-1}\):

Code
# Invert the rotation to map PC scores back to the scaled data space, and
# check the round trip (up to 6 decimals).
X_hat <- cytokine_pca$x %*% solve(cytokine_pca$rotation)
all(round(X_hat,6) == round(X, 6))
[1] TRUE

Consequently, setting \(E[,1]\) to 0 (i.e., we project the first PC on its axis) and multiplying by the inverse rotation is equivalent to removing the first PC from the original data.

Code
# Zero out the first PC's scores, then rotate back to the original (scaled)
# space: equivalent to subtracting the PC1 component from each sample.
scores_no_pc1 <- cytokine_pca$x
scores_no_pc1[, 1] <- 0
PC1_removed <- scores_no_pc1 %*% solve(cytokine_pca$rotation)

Effects of PC1 subtraction

Code
# Long-format PC1-removed values, re-joined with the sample metadata.
PC1_removed_long <- 
  PC1_removed |> 
  as_tibble() |> 
  mutate(Barcode = cytokines$Barcode) |> 
  pivot_longer(
    cols = -Barcode,
    names_to = "cytokine",
    values_to = "PC1_removed_log10_conc"
  ) |> 
  left_join(cytokines |> select(-assay), by = join_by(Barcode))

Below, we show the relationships between the scaled log10(concentrations) and the PC1-removed log10(concentrations).

Code
# Compare the scaled log10 concentrations with their PC1-removed
# counterparts, one panel per cytokine.
log10_conc <- 
  PC1_removed_long |> 
  left_join(
    get_assay_long_format(
      mae, "cytokine_log10", add_colData = FALSE,
      feature_name = "cytokine", values_name = "log10_concentration"
    ) |> 
    group_by(cytokine) |> mutate(scaled_log10_concentration = scale(log10_concentration)) |> ungroup(), 
    by = join_by(Barcode, cytokine)
  )

ggplot(log10_conc, aes(x = scaled_log10_concentration, y = PC1_removed_log10_conc)) +
  geom_point(size = 0.5, alpha = 0.25) +
  facet_wrap(cytokine ~ .) +
  xlab("scaled log10(concentration)") + ylab("PC1-removed scaled log10(concentration)")

And the correlations between cytokines after PC1 removal

Code
# Quick dimension check of the PC1-removed matrix (samples x cytokines).
dim(PC1_removed)
# rownames(PC1_removed)
# colnames(PC1_removed)
Code
# Pairwise correlations between cytokines after PC1 removal.
n_cytok <- ncol(PC1_removed)
cytok_names <- colnames(PC1_removed)
PC1_subtracted_cytok_corr <- cor(PC1_removed)
# Blank out the trivial r = 1 entries (the diagonal) so they do not
# dominate the color scale. The diagonal of cor() is exactly 1, so this
# float equality test is safe, but it would also hit any off-diagonal
# pair that happened to be perfectly correlated.
PC1_subtracted_cytok_corr[PC1_subtracted_cytok_corr == 1] <- NA
# BH-adjust the rcorr p-values. NOTE(review): the P matrix is symmetric,
# so each pair's p-value enters p.adjust() twice, making the adjustment
# somewhat conservative compared to adjusting each unique pair once —
# confirm this is intended.
res <- Hmisc::rcorr(PC1_removed)$P |> p.adjust("BH")
# p.adjust() returns a flat vector; reshape it back into a named
# cytokine x cytokine matrix for corrplot's p.mat argument.
res <- res |> matrix(nrow = n_cytok, ncol = n_cytok) |> set_colnames(cytok_names) |> set_rownames(cytok_names)
# Heatmap of correlations; cells with adjusted p >= 0.05 are flagged as
# non-significant by corrplot.
corrplot::corrplot(PC1_subtracted_cytok_corr, method = "color", addCoef.col = "black", tl.col = "black", tl.srt = 45, p.mat = res, sig.level = 0.05)

Code
# cor(PC1_removed) |> 
#   as_tibble() |> 
#   mutate(cytokine_1 = colnames(PC1_removed)) |> 
#   pivot_longer(cols = -cytokine_1, names_to = "cytokine_2", values_to = "cor") |> 
#   filter(cor != 1) |> 
#   ggplot(aes(x = cytokine_1, y = cytokine_2, fill = cor)) +
#   geom_tile() +
#   scale_fill_gradient2(
#     low = "coral", mid = "white", high = "turquoise4", midpoint = 0,
#     breaks = seq(-1, 1, by = 0.25), limits = c(-1,1)
#     ) +
#   xlab("") + ylab("") +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
#   coord_fixed()

We see that now, some cytokines are correlated and some are anti-correlated. Of particular interest, we see now that IP-10 and IL-1b are anti-correlated, which is consistent with past literature.

In addition, we check whether the PC1-removed log10(concentrations) follow the expected (based on previous publications) relationships with the proportion of Lactobacillus (and CSTs).

Code
# PC1-removed concentrations vs. proportion of Lactobacillus, with a
# per-cytokine linear trend line.
PC1_removed_long |>
  ggplot(aes(prop_Lacto, PC1_removed_log10_conc)) +
  geom_point(alpha = 0.5, size = 0.35) +
  geom_smooth(formula = "y ~ x", method = "lm") +
  facet_grid(cols = vars(cytokine)) +
  scale_x_continuous(breaks = c(0, 0.5, 1)) +
  labs(y = "PC1-removed scaled log10(concentration)")
Warning: Removed 72 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 72 rows containing missing values or values outside the scale range
(`geom_point()`).

Code
# PC1-removed concentrations by CST (CST II excluded, too rare),
# colored with the shared topic palette.
cst_levels <- PC1_removed_long$CST |> unique() |> sort()
PC1_removed_long |>
  filter(!is.na(CST), CST != "II") |>
  ggplot(aes(CST, PC1_removed_log10_conc, fill = CST)) +
  geom_boxplot(outlier.size = 0.5) +
  facet_grid(cols = vars(cytokine)) +
  scale_fill_manual(breaks = cst_levels, values = get_topic_colors(cst_levels)) +
  guides(fill = "none") +
  labs(y = "PC1-removed scaled log10(concentration)")

When considering original scaled log10-concentrations, we have:

Code
# Same relationship, but for the original scaled log10 concentrations.
log10_conc |>
  ggplot(aes(prop_Lacto, scaled_log10_concentration)) +
  geom_point(alpha = 0.5, size = 0.35) +
  geom_smooth(formula = "y ~ x", method = "lm") +
  facet_grid(cols = vars(cytokine)) +
  scale_x_continuous(breaks = c(0, 0.5, 1)) +
  labs(y = "Scaled log10(concentration)")
Warning: Removed 72 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 72 rows containing missing values or values outside the scale range
(`geom_point()`).

Code
# Original scaled log10 concentrations by CST (CST II excluded).
# Fix: breaks/values previously referenced PC1_removed_long$CST even
# though the plotted data is log10_conc; use the plotted data itself
# for consistency (the CST values are identical, as log10_conc is
# derived from PC1_removed_long).
ggplot(log10_conc |> filter(!is.na(CST), CST != "II"), 
       aes(x = CST, y = scaled_log10_concentration, fill = CST)) +
  geom_boxplot(outlier.size = 0.5) +
  facet_grid(. ~ cytokine) +
  scale_fill_manual(
    breaks = log10_conc$CST |> unique() |> sort(),
    values = get_topic_colors(log10_conc$CST |> unique() |> sort())
    ) +
  guides(fill = "none") +
  ylab("Scaled log10(concentration)")

The table below provides the correlations between the cytokine log10-concentrations and the proportion of Lactobacillus before and after PC1 removal.

Code
# Spearman correlation of each cytokine with the proportion of
# Lactobacillus, before (raw log10 assay) and after PC1 removal.
cor_before <- cor(cytokines$assay, cytokines$prop_Lacto, use = "complete", method = "spearman")
cor_after <- cor(PC1_removed, cytokines$prop_Lacto, use = "complete", method = "spearman")
tmp <- cbind(cor_before, cor_after)
colnames(tmp) <- c("log10(conc.)", "PC1 removed z-log10(conc.)")

# Heatmap of the before/after correlations, one row per cytokine.
tmp |>
  as.data.frame() |>
  rownames_to_column("cytokine") |>
  pivot_longer(-cytokine, names_to = "type", values_to = "correlation") |>
  # fct_inorder keeps the "before" column left of the "after" column.
  mutate(type = fct_inorder(type)) |>
  ggplot(aes(x = type, y = cytokine, fill = correlation)) +
  geom_tile() +
  geom_text(aes(label = round(correlation, 2)), size = 3) +
  scale_fill_gradient2() +
  xlab("") +
  ggtitle("Spearman correlations between cytokine concentrations and\nproportion of Lactobacillus for each cytokine")

Code
# tmp |> 
#   knitr::kable(caption = "Spearman correlations between cytokine concentrations and proportion of Lactobacillus for each cytokine")

These relationships appear similar to those that have been observed in past studies (and, of particular interest, studies relying on CVL samples which are probably less sensitive in terms of total material contained in samples).

Adding PC1-removed cytokines to the mae object

Since we believe that PC1 subtraction may remove a technical artifact due to the total amount of material on the swab, we save these transformed concentrations to the mae object.

Code
# Register the PC1-removed matrix as a new assay. SummarizedExperiment
# assays are features x samples, hence the transpose, with sample
# Barcodes as column names.
se_corrected <- SummarizedExperiment::SummarizedExperiment(
  assay = PC1_removed |> t() |> set_colnames(cytokines$Barcode)
)
mae <- c(mae, list(cytokine_log10_SE_corrected = se_corrected))

LLOQ and ULOQ values: Filtering cytokines and samples.

One important thing to note is that removing the size effect will lead the imputed LLOQ and ULOQ values (which were all imputed to the same values) to take different values based on the estimated size effect of each sample. For example, a LLOQ value in a sample with a high PC1 (~ a high total amount of material) will have a lower value after PC1 removal than a LLOQ value from a sample with a small PC1 (assumed to have less material on the swab). So, the effect on LLOQ and ULOQ values depends on how well the size effect could be estimated, which, in turn, depends on the number of out-of-range values in the sample. In other words, our trust in a sample's transformed concentrations depends on how many of its cytokines had observable data.

We should thus limit analyses relying on PC1-subtracted values to cytokines with sufficient data within the limits of quantification (and to samples with sufficient number of within-range cytokines).

Removing cytokines with concentrations beyond limits of quantification

Several cytokines have concentrations that were outside the limits of quantification (LLOQ and ULOQ) for a large fraction of samples. We remove these cytokines from the analysis.

Code
# Long-format table combining the raw log10 concentrations (with sample
# colData) and the PC1-removed values, each value classified as "LLOQ",
# "ULOQ", or "within_range".
cytok <- 
  get_assay_long_format(mae, "cytokine_log10", feature_name = "cytokine", values_name = "log10_concentration") |> 
  left_join(
    get_assay_long_format(mae, "cytokine_log10_SE_corrected", feature_name = "cytokine", values_name = "transformed_concentration", add_colData = FALSE),
    by = join_by(Barcode, cytokine) 
  ) |> 
  group_by(cytokine) |> 
  mutate(
    # The per-cytokine min/max are taken to be the imputed LLOQ/ULOQ
    # (out-of-range values were all imputed to the same constants).
    # NOTE(review): this assumes each cytokine has at least one value at
    # each limit — true in the summary table below, but confirm on
    # future data refreshes.
    value_type = 
      case_when(
        log10_concentration == min(log10_concentration) ~ "LLOQ",
        log10_concentration == max(log10_concentration) ~ "ULOQ",
        TRUE ~ "within_range"
      )
  ) |> 
  ungroup() |> 
  select(Barcode, cytokine, value_type, log10_concentration, transformed_concentration, everything())

# Per-cytokine counts and fractions of values at the limits of
# quantification (values equal to the per-cytokine min/max, i.e. the
# imputed LLOQ/ULOQ constants).
fraction_at_LLOQ <- 
  cytok |> 
  group_by(cytokine) |> 
  summarize(
    n_LLOQ = sum(log10_concentration == min(log10_concentration)),
    n_ULOQ = sum(log10_concentration == max(log10_concentration)),
    n_tot = n()
  ) |> 
  mutate(
    `fraction below LLOQ` = n_LLOQ/n_tot,
    `fraction above ULOQ` = n_ULOQ/n_tot
    ) |> 
  # Most LLOQ-affected cytokines first.
  arrange(-`fraction below LLOQ`)

fraction_at_LLOQ |> gt()
cytokine n_LLOQ n_ULOQ n_tot fraction below LLOQ fraction above ULOQ
IL-10 862 1 1151 0.7489139878 0.0008688097
IL-4 838 1 1151 0.7280625543 0.0008688097
IL-13 721 1 1151 0.6264118158 0.0008688097
IL-21 719 1 1151 0.6246741964 0.0008688097
TNFa 662 1 1151 0.5751520417 0.0008688097
IL-23 590 1 1151 0.5125977411 0.0008688097
IL-17 570 1 1151 0.4952215465 0.0008688097
MIP-1b 555 1 1151 0.4821894005 0.0008688097
ITAC 527 1 1151 0.4578627281 0.0008688097
IFNg 476 1 1151 0.4135534318 0.0008688097
IL-6 423 1 1151 0.3675065161 0.0008688097
IL-1b 313 4 1151 0.2719374457 0.0034752389
MIP-3a 213 1 1151 0.1850564726 0.0008688097
MIP-1a 171 1 1151 0.1485664639 0.0008688097
MIG 128 55 1151 0.1112076455 0.0477845352
IP-10 62 25 1151 0.0538662033 0.0217202433
IL-1a 9 7 1151 0.0078192876 0.0060816681
IL-8 1 399 1151 0.0008688097 0.3466550825
Code
# Inclusion thresholds: a cytokine is excluded when 60% or more of its
# values sit at the LLOQ, or 30% or more at the ULOQ.
threshold_LLOQ <- 0.6
threshold_ULOQ <- 0.3
Code
# Flag each cytokine as included/excluded according to the thresholds
# above, and extract the two name vectors used downstream.
fraction_at_LLOQ <- fraction_at_LLOQ |>
  mutate(
    included =
      `fraction below LLOQ` < threshold_LLOQ &
      `fraction above ULOQ` < threshold_ULOQ
  )

included_cytokines <- fraction_at_LLOQ |> filter(included) |> pull(cytokine) |> as.character()
excluded_cytokines <- fraction_at_LLOQ |> filter(!included) |> pull(cytokine) |> as.character()
Code
# Annotate every (sample, cytokine) row with the cytokine-level
# inclusion summary computed above.
cytok <- cytok |> left_join(fraction_at_LLOQ, by = join_by(cytokine))
Code
# Distribution of the (imputed) log10 concentrations per cytokine,
# colored by whether the cytokine passes the LLOQ/ULOQ inclusion
# criteria.
g <- 
ggplot(cytok, aes(x = log10_concentration, fill = included)) +
  geom_histogram(bins = 50) +
  facet_wrap(cytokine ~ ., nrow = 3) +
  theme(legend.position = "bottom") +
  labs(
    x = "log10(imputed concentrations)",
    # Fix: `included` is a cytokine-level flag (see the exclusion rule in
    # the surrounding text), so the caption must describe cytokine — not
    # sample — exclusion.
    caption = str_c("Cytokines were excluded if more than ", threshold_LLOQ*100,"% of their values were below the LLOQ or more than ",threshold_ULOQ*100 ,"% above the ULOQ")
    )

g

Code
# Distribution of the PC1-removed values, colored by whether the original
# value sat at a limit of quantification; each panel is labelled
# "included"/"excluded" per the cytokine-level filter.
excl_labels <- fraction_at_LLOQ |>
  mutate(included = ifelse(included, "included","excluded"))

ggplot(cytok, aes(x = transformed_concentration)) +
  geom_histogram(aes(fill = value_type), bins = 100) +
  geom_text(
    data = excl_labels,
    aes(label = included, col = included),
    x = Inf, y = Inf, hjust = 1, vjust = 1
  ) +
  facet_wrap(cytokine ~ ., nrow = 3) +
  theme(legend.position = "bottom")

Code
# Export the inclusion histogram as supplementary figure S4A.
ggsave(g, filename = str_c(fig_out_dir(), "S4A.pdf"), width = 10, height = 6, device = cairo_pdf)

Specifically, we exclude cytokines with concentration below the LLOQ for over 60% of samples or above the ULOQ for over 30% of samples.

Based on these criteria, we exclude IL-10, IL-4, IL-13, IL-21, IL-8 from downstream analyses.

We add a new assay to the mae object with the filtered cytokines.

Code
# Subset the PC1-removed assay to the included cytokines and register it
# as a new assay.
kept <- MultiAssayExperiment::assay(mae, "cytokine_log10_SE_corrected")[included_cytokines, ]

mae <- c(
  mae,
  list(
    cytokine_log10_SE_corrected_incl_cytokine =
      SummarizedExperiment::SummarizedExperiment(assay = kept)
  )
)

In case one is interested in also doing the analysis on the non-PC1-subtracted cytokine data, we also add this assay to the mae object.

Code
# Same cytokine filtering applied to the non-PC1-removed log10 assay.
kept_log10 <- MultiAssayExperiment::assay(mae, "cytokine_log10")[included_cytokines, ]

mae <- c(
  mae,
  list(
    cytokine_log10_incl_cytokine =
      SummarizedExperiment::SummarizedExperiment(assay = kept_log10)
  )
)

Flagging samples with too few cytokines within limits of quantification.

Code
# Human-readable visit labels for plotting.
cytok <- cytok |> mutate(Visit = AVISITN |> get_visit_labels()) 

# Version of the table restricted to the included cytokines.
# NOTE(review): `cytok_filtered` is not used below — `sample_summary` is
# computed from the unfiltered `cytok`, so n_within_range/n_tot count all
# 18 cytokines, not only the 13 included ones. Confirm whether the
# filtered table was intended here.
cytok_filtered <- cytok |> filter(cytokine %in% included_cytokines)

# Per-sample count of cytokines within the limits of quantification.
sample_summary <- 
  cytok |>
  group_by(Barcode, Visit, USUBJID, PIPV, ARM) |>
  summarize(
    n_within_range = sum(value_type == "within_range"),
    n_tot = n(),
    .groups = "drop"
  ) |> 
  arrange(USUBJID, Visit)

# Order participants by their average within-range count so the heatmap
# rows run from most to least affected.
sample_summary <- 
  sample_summary |> 
  group_by(USUBJID) |> mutate(mean_n_within_range = mean(n_within_range)) |> ungroup() |> 
  arrange(mean_n_within_range) |> mutate(USUBJID = USUBJID |> fct_inorder())
                                         
# Heatmap of within-range cytokine counts per participant and visit,
# split by study arm, for samples with PIPV == TRUE (presumably the
# per-protocol flag — confirm).
sample_summary |>
  filter(PIPV) |>
  ggplot(aes(Visit, USUBJID)) +
  geom_tile(aes(fill = n_within_range)) +
  scale_fill_gradient2(
    low = "red", mid = "gray", high = "steelblue1",
    midpoint = 5.5, breaks = seq(0, 13, by = 3)
  ) +
  facet_grid(ARM ~ ., scales = "free_y", space = "free")

Again, we see a strong participant and visit effect on the number of out-of-range values.

Code
# Within-range counts per visit and arm (PIPV samples only).
sample_summary |>
  filter(PIPV) |>
  ggplot(aes(Visit, n_within_range, fill = ARM)) +
  geom_boxplot() +
  scale_fill_manual(values = get_arm_colors())

Code
# Distribution of within-range counts per arm; the dashed red line marks
# the flagging threshold used just below (fewer than 5 within-range).
sample_summary |>
  ggplot(aes(n_within_range, fill = ARM)) +
  geom_vline(xintercept = 5.5, col = "red", linetype = 2) +
  geom_histogram(binwidth = 0.5) +
  facet_grid(rows = vars(ARM), scales = "free_y") +
  scale_fill_manual(values = get_arm_colors())

We flag samples with strictly less than 5 cytokines within the limits of quantification:

Code
# Samples with strictly fewer than 5 within-range cytokines are flagged;
# the rest are kept for analyses of the transformed concentrations.
flagged_samples <- sample_summary |> filter(n_within_range < 5)
included_samples <- sample_summary |> filter(n_within_range >= 5)

# Number of flagged samples per participant, most affected first.
flagged_samples |>
  count(USUBJID, ARM, name = "n_flagged_samples") |>
  arrange(desc(n_flagged_samples)) |>
  gt()
USUBJID ARM n_flagged_samples
STI.00224 LACTIN-V 4
STI.00435 LACTIN-V 2
STI.00469 LACTIN-V 2
STI.00219 LACTIN-V 2
STI.00476 LACTIN-V 2
STI.00266 Placebo 2
STI.00307 Placebo 2
STI.00616 LACTIN-V 1
STI.00226 LACTIN-V 1
STI.00200 Placebo 1
STI.00987 Placebo 1
STI.00627 Placebo 1
STI.00268 LACTIN-V 1
STI.01200 LACTIN-V 1
STI.00285 LACTIN-V 1
STI.00405 LACTIN-V 1
STI.00461 Placebo 1
STI.00320 Placebo 1
STI.00401 LACTIN-V 1
STI.00332 Placebo 1
STI.00693 LACTIN-V 1
STI.00233 Placebo 1
STI.00452 LACTIN-V 1
STI.00632 Placebo 1
STI.00350 Placebo 1
STI.00337 LACTIN-V 1
STI.00406 LACTIN-V 1
STI.01152 LACTIN-V 1
STI.01089 Placebo 1
STI.00568 LACTIN-V 1
STI.01024 LACTIN-V 1
STI.00462 LACTIN-V 1
STI.00819 Placebo 1
STI.00453 LACTIN-V 1
STI.00758 Placebo 1
STI.00683 LACTIN-V 1
STI.00194 LACTIN-V 1
STI.00964 LACTIN-V 1
STI.00907 LACTIN-V 1
STI.00458 Placebo 1
STI.00394 LACTIN-V 1
STI.00434 Placebo 1
STI.00820 LACTIN-V 1
STI.01179 LACTIN-V 1
Code
# Flagged-sample counts as a Visit x Arm table with a "Total" grand
# summary row at the bottom.
flagged_samples |> 
  # Normalize arm labels ("LACTIN-V" -> "Lactin-V") for display.
  mutate(Arm = ARM |> str_replace("ACTIN", "actin")) |> 
  arrange(Visit, Arm) |> 
  group_by(Visit, Arm) |> 
  summarize(`n flagged samples` = n(), .groups = "drop") |> 
  pivot_wider(names_from = Arm, values_from = `n flagged samples`) |> 
  gt(
    rowname_col = "Visit",
    caption = "Number of flagged samples"
  ) |>
  grand_summary_rows(
    columns = -c("Visit"),
    fns =  list(label = "Total", id = "totals", fn = "sum"),
    fmt = ~ fmt_integer(.),
    side = "bottom"
  ) 
Number of flagged samples
Lactin-V Placebo
Pre-MTZ 7 2
Post-MTZ 11 7
Week 4 2 2
Week 8 7 1
Week 12 6 2
Week 24 2 4
Total 35 18
Code
# Final "cytokine_transformed" assay: PC1-removed values, included
# cytokines only, restricted to the samples with enough within-range
# cytokines.
kept_transformed <- MultiAssayExperiment::assay(
  mae, "cytokine_log10_SE_corrected_incl_cytokine"
)[, included_samples$Barcode]

mae <- c(
  mae,
  list(
    cytokine_transformed =
      SummarizedExperiment::SummarizedExperiment(assay = kept_transformed)
  )
)

In the cytokine_log10_incl_cytokine assay, we only add columns (n_within_range and flagged) to the assay colData to flag these samples.

Code
# Pull the colData of the filtered (non-PC1-removed) cytokine assay.
tmp <- mae[["cytokine_log10_incl_cytokine"]]@colData |> as.data.frame() |> rownames_to_column("Barcode") |> as_tibble()
# Drop any stale versions of the columns first so the chunk can be
# re-run without duplicating them, then join the per-sample counts and
# the flag (fewer than 5 within-range cytokines).
tmp <- 
  tmp |> 
  select(-any_of(c("n_within_range", "flagged"))) |> 
  left_join(
    sample_summary |> select(Barcode, n_within_range) |> mutate(flagged = n_within_range < 5), 
    by = join_by(Barcode)
    )

# Positional assignment is safe: `tmp` rows are in the same order as the
# assay's colData (built from its rownames, and left_join preserves the
# left table's row order).
mae[["cytokine_log10_incl_cytokine"]]$n_within_range <- tmp$n_within_range
mae[["cytokine_log10_incl_cytokine"]]$flagged <- tmp$flagged
Code
# Annotate each (sample, cytokine) pair with whether the sample was
# flagged (too few within-range cytokines) or included; samples absent
# from `included_samples` get NA from the join, replaced by "flagged".
log10_conc <- 
  log10_conc |> 
  select(-any_of("sample")) |> 
  left_join(included_samples |> select(Barcode) |> mutate(sample = "included"), by = join_by(Barcode)) |> 
  select(Barcode, cytokine, PC1_removed_log10_conc, scaled_log10_concentration, sample) |> 
  mutate(sample = sample |> str_replace_na("flagged"))
 

ggplot(log10_conc) +
  aes(x = scaled_log10_concentration, y = PC1_removed_log10_conc, col = sample) +
  geom_point(size = 0.5, alpha = 0.25) +
  facet_wrap(cytokine ~ .) +
  # Fix: axis labels previously said "log2" although the plotted
  # variables are log10-based (matching the earlier identical plot).
  xlab("scaled log10(concentration)") + ylab("PC1-removed scaled log10(concentration)")

Reordering cytokines

Finally, to ease the interpretation of downstream analyses, we re-order the cytokines such that correlated ones are grouped together.

Code
# Retrieve the PC1-removed assay in wide format (samples x cytokines)
# and keep it as a plain data.frame with Barcode rownames.
PC1_removed <- get_assay_wide_format(mae, "cytokine_log10_SE_corrected")
PC1_removed <- PC1_removed$assay |> as.data.frame() |> set_rownames(PC1_removed$Barcode)
Code
# Hierarchically cluster the cytokines on correlation distance (1 - r)
# so that correlated cytokines end up adjacent in the ordering.
cor_c <- cor(PC1_removed)
cytokine_dist <- as.dist(1 - cor_c)
hclust_cor <- hclust(cytokine_dist)
ordered_cytokines <- colnames(cor_c)[hclust_cor$order]
Code
# Correlation heatmap with cytokines in clustering order: correlated
# cytokines now appear next to each other.
corrplot::corrplot(cor_c[ordered_cytokines, ordered_cytokines], method = "color",  tl.col = "black", tl.srt = 45)

Modifying the existing mae assays:

Code
# Re-order the cytokine rows of every cytokine assay so that correlated
# cytokines (per the clustering above) are adjacent. The original code
# repeated the same three lines six times; a loop over the assay names
# does the same thing without the copy-paste.
cytokine_assays <- c(
  "cytokine",
  "cytokine_log10",
  "cytokine_log10_SE_corrected",
  "cytokine_log10_SE_corrected_incl_cytokine",
  "cytokine_log10_incl_cytokine",
  "cytokine_transformed"
)
for (assay_name in cytokine_assays) {
  ac <- mae[[assay_name]]
  # intersect() keeps the clustering order while dropping cytokines that
  # were filtered out of this particular assay.
  j <- ordered_cytokines |> intersect(rownames(ac))
  mae[[assay_name]] <- ac[j, ]
}

Saving augmented mae

We can now save this augmented mae object.

Code
# Persist the augmented MAE. save_mae() skips the write when the object
# is identical to the most recently saved version (see the "MAE is
# identical" message in the rendered output).
output_dir <- str_c(data_dir(), "03_augmented_mae/")

save_mae(mae, output_dir)
MAE is identical to the last one. Not saving.
Code
# If dropbox mounting was done manually, remember to unmount it.

As a reminder, the “raw” mae object had the following assays:

Code
# Recap: assays available in the MAE as originally loaded.
raw_mae
A MultiAssayExperiment object of 7 listed
 experiments with user-defined names and respective classes.
 Containing an ExperimentList class object of length 7:
 [1] swabs: SummarizedExperiment with 2 rows and 1914 columns
 [2] ASV_16S_all: SummarizedExperiment with 9369 rows and 1176 columns
 [3] ASV_16S: SummarizedExperiment with 9369 rows and 1174 columns
 [4] tax_16S: SummarizedExperiment with 220 rows and 1174 columns
 [5] cytokine: SummarizedExperiment with 18 rows and 1151 columns
 [6] MG_all_Lc_strains: SummarizedExperiment with 301 rows and 1110 columns
 [7] MG_CTV05: SummarizedExperiment with 4 rows and 1110 columns
Functionality:
 experiments() - obtain the ExperimentList instance
 colData() - the primary/phenotype DataFrame
 sampleMap() - the sample coordination DataFrame
 `$`, `[`, `[[` - extract colData columns, subset, or experiment
 *Format() - convert into a long or wide DataFrame
 assays() - convert ExperimentList to a SimpleList of matrices
 exportClass() - save data to flat files

Now, the mae object has these assays + additional ones:

Code
# The augmented MAE: original assays plus the derived/transformed ones.
mae
A MultiAssayExperiment object of 24 listed
 experiments with user-defined names and respective classes.
 Containing an ExperimentList class object of length 24:
 [1] swabs: SummarizedExperiment with 2 rows and 1914 columns
 [2] ASV_16S_all: SummarizedExperiment with 9369 rows and 1176 columns
 [3] ASV_16S: SummarizedExperiment with 9369 rows and 1174 columns
 [4] tax_16S: SummarizedExperiment with 220 rows and 1174 columns
 [5] cytokine: SummarizedExperiment with 18 rows and 1151 columns
 [6] MG_all_Lc_strains: SummarizedExperiment with 301 rows and 1110 columns
 [7] MG_CTV05: SummarizedExperiment with 4 rows and 1110 columns
 [8] ASV_16S_filtered: SummarizedExperiment with 780 rows and 1174 columns
 [9] ASV_16S_filtered_p: SummarizedExperiment with 780 rows and 1174 columns
 [10] tax_16S_p: SummarizedExperiment with 220 rows and 1174 columns
 [11] tax_CTV05_p: SummarizedExperiment with 222 rows and 1176 columns
 [12] prop_Lacto: SummarizedExperiment with 6 rows and 1174 columns
 [13] mb_categories: SummarizedExperiment with 1 rows and 1174 columns
 [14] mb_categories_wide: SummarizedExperiment with 3 rows and 1174 columns
 [15] CST: SummarizedExperiment with 3 rows and 1174 columns
 [16] topics_16S_7: SummarizedExperiment with 7 rows and 1174 columns
 [17] c_topics_16S_8: SummarizedExperiment with 8 rows and 1174 columns
 [18] c_topics_16S_9: SummarizedExperiment with 9 rows and 1174 columns
 [19] c_topics_16S_10_ctv05: SummarizedExperiment with 10 rows and 1108 columns
 [20] cytokine_log10: SummarizedExperiment with 18 rows and 1151 columns
 [21] cytokine_log10_SE_corrected: SummarizedExperiment with 18 rows and 1151 columns
 [22] cytokine_log10_SE_corrected_incl_cytokine: SummarizedExperiment with 13 rows and 1151 columns
 [23] cytokine_log10_incl_cytokine: SummarizedExperiment with 13 rows and 1151 columns
 [24] cytokine_transformed: SummarizedExperiment with 13 rows and 1098 columns
Functionality:
 experiments() - obtain the ExperimentList instance
 colData() - the primary/phenotype DataFrame
 sampleMap() - the sample coordination DataFrame
 `$`, `[`, `[[` - extract colData columns, subset, or experiment
 *Format() - convert into a long or wide DataFrame
 assays() - convert ExperimentList to a SimpleList of matrices
 exportClass() - save data to flat files

Session Info

Code
# Record R and package versions for reproducibility.
sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/Brussels
tzcode source: internal

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] alto_0.1.0        ValenciaR_0.1.0   phyloseq_1.50.0   ggthemes_5.1.0   
 [5] gt_1.0.0          viridis_0.6.5     viridisLite_0.4.2 patchwork_1.3.0  
 [9] magrittr_2.0.3    lubridate_1.9.4   forcats_1.0.0     stringr_1.5.1    
[13] dplyr_1.1.4       purrr_1.0.4       readr_2.1.5       tidyr_1.3.1      
[17] tibble_3.2.1      ggplot2_3.5.2     tidyverse_2.0.0  

loaded via a namespace (and not attached):
  [1] RColorBrewer_1.1-3          rstudioapi_0.17.1          
  [3] jsonlite_2.0.0              MultiAssayExperiment_1.32.0
  [5] modeltools_0.2-24           nloptr_2.2.1               
  [7] corrplot_0.95               farver_2.1.2               
  [9] rmarkdown_2.29              fs_1.6.6                   
 [11] zlibbioc_1.52.0             vctrs_0.6.5                
 [13] multtest_2.62.0             minqa_1.2.8                
 [15] topicmodels_0.2-17          base64enc_0.1-3            
 [17] htmltools_0.5.8.1           S4Arrays_1.6.0             
 [19] BiocBaseUtils_1.8.0         curl_6.2.3                 
 [21] broom_1.0.8                 Rhdf5lib_1.28.0            
 [23] SparseArray_1.6.2           Formula_1.2-5              
 [25] rhdf5_2.50.2                sass_0.4.10                
 [27] YC_0.1.0                    htmlwidgets_1.6.4          
 [29] plyr_1.8.9                  igraph_2.1.4               
 [31] lifecycle_1.0.4             iterators_1.0.14           
 [33] pkgconfig_2.0.3             Matrix_1.7-3               
 [35] R6_2.6.1                    fastmap_1.2.0              
 [37] GenomeInfoDbData_1.2.13     rbibutils_2.3              
 [39] MatrixGenerics_1.18.1       digest_0.6.37              
 [41] colorspace_2.1-1            S4Vectors_0.44.0           
 [43] Hmisc_5.2-3                 GenomicRanges_1.58.0       
 [45] vegan_2.6-10                philentropy_0.9.0          
 [47] labeling_0.4.3              timechange_0.3.0           
 [49] httr_1.4.7                  abind_1.4-8                
 [51] mgcv_1.9-1                  compiler_4.4.2             
 [53] bit64_4.6.0-1               withr_3.0.2                
 [55] backports_1.5.0             htmlTable_2.4.3            
 [57] carData_3.0-5               MASS_7.3-65                
 [59] DelayedArray_0.32.0         biomformat_1.34.0          
 [61] permute_0.9-7               CVXR_1.0-15                
 [63] tools_4.4.2                 foreign_0.8-90             
 [65] ape_5.8-1                   nnet_7.3-19                
 [67] glue_1.8.0                  nlme_3.1-168               
 [69] rhdf5filters_1.18.1         checkmate_2.3.2            
 [71] cluster_2.1.8.1             reshape2_1.4.4             
 [73] ade4_1.7-23                 generics_0.1.4             
 [75] lpSolve_5.6.23              gtable_0.3.6               
 [77] tzdb_0.5.0                  data.table_1.17.4          
 [79] hms_1.1.3                   car_3.1-3                  
 [81] xml2_1.3.8                  XVector_0.46.0             
 [83] BiocGenerics_0.52.0         ggrepel_0.9.6              
 [85] foreach_1.5.2               pillar_1.10.2              
 [87] vroom_1.6.5                 splines_4.4.2              
 [89] lattice_0.22-6              survival_3.8-3             
 [91] gmp_0.7-5                   bit_4.6.0                  
 [93] tidyselect_1.2.1            tm_0.7-16                  
 [95] Biostrings_2.74.1           knitr_1.50                 
 [97] reformulas_0.4.1            gridExtra_2.3              
 [99] NLP_0.3-2                   IRanges_2.40.1             
[101] SummarizedExperiment_1.36.0 stats4_4.4.2               
[103] xfun_0.52                   Biobase_2.66.0             
[105] T4transport_0.1.3           matrixStats_1.5.0          
[107] stringi_1.8.7               UCSC.utils_1.2.0           
[109] boot_1.3-31                 yaml_2.3.10                
[111] evaluate_1.0.3              codetools_0.2-20           
[113] cli_3.6.5                   rpart_4.1.23               
[115] Rdpack_2.6.4                Rcpp_1.0.14                
[117] GenomeInfoDb_1.42.3         parallel_4.4.2             
[119] lme4_1.1-37                 Rmpfr_1.1-0                
[121] slam_0.1-55                 scales_1.4.0               
[123] crayon_1.5.3                rlang_1.1.6